From ff618ffa355f432e2705160b6fa522ff27b55227 Mon Sep 17 00:00:00 2001 From: max Date: Tue, 12 Mar 2024 15:34:14 +0100 Subject: [PATCH] added class method --- examples/clustering/test_cluster.ipynb | 120 ++++++++++++++++--------- pyeed/containers/__init__.py | 1 + pyeed/containers/mmseqs2.py | 57 +++++++----- 3 files changed, 116 insertions(+), 62 deletions(-) diff --git a/examples/clustering/test_cluster.ipynb b/examples/clustering/test_cluster.ipynb index 78366fed..f21c3518 100644 --- a/examples/clustering/test_cluster.ipynb +++ b/examples/clustering/test_cluster.ipynb @@ -19,48 +19,17 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'/Users/max/Documents/GitHub/pyeed/examples/networks/test_data/EQL02201.1.json'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "list(Path(\"test_data\").rglob(\"*.json\"))[0].absolute().__str__()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, "outputs": [], "source": [ "sequences = []\n", - "for path in Path(\"test_data\").rglob(\"*.json\"):\n", + "for path in Path(\"data\").rglob(\"*.json\"):\n", " with open(str(path.absolute())) as f:\n", " sequences.append(ProteinInfo.from_json(f))" ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "multifasta_string = \"\\n\".join(\n", - " [f\">{seq.source_id}\\n{seq.sequence}\" for seq in sequences]\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 37, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -70,27 +39,94 @@ "🏃 Clustering sequences with MMSeqs2...\n", "╭── initial sequences: 201\n", "├── min. coverage: 80 %\n", - "╰── min. sequence identity: 0 %\n", - "🎉 Clustering completed\n", - "\n" + "╰── min. sequence identity: 77 %\n", + "🎉 Clustered intitial sequences in 41 representative sequences\n" ] }, { "data": { "text/plain": [ - "[ProteinInfo(id='proteininfo190', source_id='KAE8211275.1', name='hypothetical protein', sequence='MASSASASGSSSSSSSQAQQLPTQQQQQQQSQPQSNGASSQDEAAERNSVIIKVGMVGDSQIGKTSLMVRYVEGSFNEDYIQTLGVNFMEKTISIRNTEITFSIWDLGGQREFVNMLPLVCNDAVAILFMFDLSRKSTLNSIKEWYRQARGFNKTAIPFLVGTKYDQFATFAQEEQEEITKQARRFARAMKAPLIFSSTSHSINVQKIFKIILSKAFDLKCTIPEIKGIGEPLLIYVET', organism=Organism(id='organism720', name='Tilletia walkeri', taxonomy_id='117179', domain='Eukaryota', kingdom='Fungi', phylum='Basidiomycota', tax_class='Exobasidiomycetes', order='Tilletiales', family='Tilletiaceae', genus='Tilletia', species='Tilletia walkeri'), citation=Citation(id='citation25', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion395', name='Spg1', spans=[Span(id='span1478', start=51, end=233)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site7088', name='other', type='unannotated', positions=[51], cross_ref='CDD:206701'), Site(id='site7089', name='other', type='unannotated', positions=[57], cross_ref='CDD:206701'), Site(id='site7090', name='other', type='unannotated', positions=[59, 75, 81, 108, 162, 165, 197], cross_ref='CDD:206701'), Site(id='site7091', name='other', type='unannotated', positions=[65, 80], cross_ref='CDD:206701'), Site(id='site7092', name='other', type='unannotated', positions=[75, 80], cross_ref='CDD:206701'), Site(id='site7093', name='other', type='unannotated', positions=[80, 84, 98, 100], cross_ref='CDD:206701'), Site(id='site7094', name='other', type='unannotated', positions=[82], cross_ref='CDD:206701'), Site(id='site7095', name='active', type='active', positions=[83, 85, 102, 104, 111, 115, 119, 121], cross_ref='CDD:206701'), Site(id='site7096', name='other', type='unannotated', positions=[83, 86, 104, 112, 114, 116], cross_ref='CDD:206701'), Site(id='site7097', name='other', type='unannotated', positions=[83], cross_ref='CDD:206701'), Site(id='site7098', name='other', type='unannotated', positions=[100], cross_ref='CDD:206701'), Site(id='site7099', name='other', type='unannotated', positions=[105], cross_ref='CDD:206701'), Site(id='site7100', name='other', type='unannotated', positions=[108, 110], cross_ref='CDD:206701'), Site(id='site7101', name='other', type='unannotated', positions=[111], cross_ref='CDD:206701'), Site(id='site7102', name='other', type='unannotated', positions=[119], cross_ref='CDD:206701'), Site(id='site7103', name='other', type='unannotated', positions=[128], cross_ref='CDD:206701'), Site(id='site7104', name='other', type='unannotated', positions=[151], cross_ref='CDD:206701'), Site(id='site7105', name='other', type='unannotated', positions=[162], cross_ref='CDD:206701'), Site(id='site7106', name='other', type='unannotated', positions=[197], cross_ref='CDD:206701'), Site(id='site7107', name='other', type='unannotated', positions=[212], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='RDSH01000145.1', name=None, spans=[Span(id='span1486', start=10948, end=10959)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", - " ProteinInfo(id='proteininfo112', source_id='KAI8960093.1', name='small GTPase', sequence='MHIAGGPGIHEPHHRQSRSLDQGLPSGAHHDDIDPHAADPNAIDHNGQFEVPSYEPPAQSMSRPASSSGTGERQLNGEASREGNGADRTRNHVVIKVGMVGDAQIGKTSLMVKYVEGSWDEDYIQTLGVNFMEKTISIRNTEITFSIWDLGGQREFVNMLPLVCNDAVAILFMFDLTRKSTLNSIKEWYRQGRGFNKTAIPILVGTKYDHFVNFPREDQEEISNQAKRFAKAMRAALIFSSTSHSINVQKIFKIVLSKAFDLKCTIPEIENVGEPLLLYQSV', organism=Organism(id='organism606', name='Daldinia sp. FL1419', taxonomy_id='2614575', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Sordariomycetes', order='Xylariales', family='Hypoxylaceae', genus='Daldinia', species='Daldinia sp. FL1419'), citation=Citation(id='citation40', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion319', name='Spg1', spans=[Span(id='span1163', start=94, end=276)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site5628', name='other', type='unannotated', positions=[94], cross_ref='CDD:206701'), Site(id='site5629', name='other', type='unannotated', positions=[100], cross_ref='CDD:206701'), Site(id='site5630', name='other', type='unannotated', positions=[102, 118, 124, 151, 205, 208, 240], cross_ref='CDD:206701'), Site(id='site5631', name='other', type='unannotated', positions=[108, 123], cross_ref='CDD:206701'), Site(id='site5632', name='other', type='unannotated', positions=[118, 123], cross_ref='CDD:206701'), Site(id='site5633', name='other', type='unannotated', positions=[123, 127, 141, 143], cross_ref='CDD:206701'), Site(id='site5634', name='other', type='unannotated', positions=[125], cross_ref='CDD:206701'), Site(id='site5635', name='active', type='active', positions=[126, 128, 145, 147, 154, 158, 162, 164], cross_ref='CDD:206701'), Site(id='site5636', name='other', type='unannotated', positions=[126, 129, 147, 155, 157, 159], cross_ref='CDD:206701'), Site(id='site5637', name='other', type='unannotated', positions=[126], cross_ref='CDD:206701'), Site(id='site5638', name='other', type='unannotated', positions=[143], cross_ref='CDD:206701'), Site(id='site5639', name='other', type='unannotated', positions=[148], cross_ref='CDD:206701'), Site(id='site5640', name='other', type='unannotated', positions=[151, 153], cross_ref='CDD:206701'), Site(id='site5641', name='other', type='unannotated', positions=[154], cross_ref='CDD:206701'), Site(id='site5642', name='other', type='unannotated', positions=[162], cross_ref='CDD:206701'), Site(id='site5643', name='other', type='unannotated', positions=[171], cross_ref='CDD:206701'), Site(id='site5644', name='other', type='unannotated', positions=[194], cross_ref='CDD:206701'), Site(id='site5645', name='other', type='unannotated', positions=[205], cross_ref='CDD:206701'), Site(id='site5646', name='other', type='unannotated', positions=[240], cross_ref='CDD:206701'), Site(id='site5647', name='other', type='unannotated', positions=[255], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='MU623420.1', name=None, spans=[Span(id='span1166', start=59976, end=60074)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", - " ProteinInfo(id='proteininfo26', source_id='SCV02624.1', name='LAMI_0H01266g1_1', sequence='MSTNEANVPTLPQPKNTFTIKVGLIGDAQVGKTSLMVKYVENVFDEEYTQTLGVNCLDKKIKLGSADIVFYIMDLGGQREFINMLPLASEGARAIIFLFDLTRPETLKSIKEWHRQATGFNEMAVPLLVGTKYDLFVNFDPEYQVQVSKQSMRYAQAMDAPLIFCSTSHSINIQKIFKVIIAKLYNLTMRASEIKQIGDPLLIYKHLGSKRRFSENPSSRNSSSRTPSPHRSP', organism=Organism(id='organism620', name='Lachancea mirantina', taxonomy_id='1230905', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Saccharomycetes', order='Saccharomycetales', family='Saccharomycetaceae', genus='Lachancea', species='Lachancea mirantina'), citation=Citation(id='citation47', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion231', name='P-loop_NTPase', spans=[Span(id='span816', start=19, end=201)], note='P-loop containing Nucleoside Triphosphate Hydrolases; cl38936', cross_reference='CDD:453896', type=None)], sites=[Site(id='site4048', name='other', type='unannotated', positions=[25], cross_ref='CDD:206648'), Site(id='site4049', name='other', type='unannotated', positions=[27, 76, 130, 133, 165], cross_ref='CDD:206648'), Site(id='site4050', name='other', type='unannotated', positions=[50], cross_ref='CDD:206648'), Site(id='site4051', name='other', type='unannotated', positions=[54], cross_ref='CDD:206648'), Site(id='site4052', name='other', type='unannotated', positions=[73], cross_ref='CDD:206648'), Site(id='site4053', name='other', type='unannotated', positions=[75, 92], cross_ref='CDD:206648'), Site(id='site4054', name='other', type='unannotated', positions=[130], cross_ref='CDD:206648'), Site(id='site4055', name='other', type='unannotated', positions=[165], cross_ref='CDD:206648')], coding_sequence_ref=DNARegion(id='LT598468.1', name=None, spans=[Span(id='span817', start=103775, end=104476)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", - " ProteinInfo(id='proteininfo195', source_id='KAJ5714723.1', name='hypothetical protein', sequence='MDPLQQPVSEVPAVAEPESQPPIEPQAPEVAQPLPMSSEQPLEHAPAPVTVNVEEPAPEDPTTDSTVGPEESTTSRVAVEPGYASDSKAHYHSRSSEFHHSAPEYNPRDTDPASRFTSSPSLTQQPQLHNASRPGSGFSSGPERQAAPQPTLPEVPHRGQLSQASRNSVVIKVGMVGDAQIGKTSLMVKYVEGSWDEDYIQTLGVNFMEKTISIRNTEITFSIWDLGGQREFVNMLPLVCNDAVAILFMFDLTRKSTLNSIKEWYRQGRGFNKTAIPFLVGTKYDHFVNFPREDQEEISLQAKRFAKAMKASLIFSSTSHSINVQKIFKIVLAKAFDLKCTIPEIENVGEPLLLYKSV', organism=Organism(id='organism626', name='Penicillium malachiteum', taxonomy_id='1324776', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Eurotiomycetes', order='Eurotiales', family='Aspergillaceae', genus='Penicillium', species='Penicillium malachiteum'), citation=Citation(id='citation71', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion400', name='PHA03247', spans=[Span(id='span1501', start=6, end=156)], note='large tegument protein UL36; Provisional', cross_reference='CDD:223021', type=None), ProteinRegion(id='proteinregion401', name='Spg1', spans=[Span(id='span1502', start=170, end=352)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site7188', name='other', type='unannotated', positions=[170], cross_ref='CDD:206701'), Site(id='site7189', name='other', type='unannotated', positions=[176], cross_ref='CDD:206701'), Site(id='site7190', name='other', type='unannotated', positions=[178, 194, 200, 227, 281, 284, 316], cross_ref='CDD:206701'), Site(id='site7191', name='other', type='unannotated', positions=[184, 199], cross_ref='CDD:206701'), Site(id='site7192', name='other', type='unannotated', positions=[194, 199], cross_ref='CDD:206701'), Site(id='site7193', name='other', type='unannotated', positions=[199, 203, 217, 219], cross_ref='CDD:206701'), Site(id='site7194', name='other', type='unannotated', positions=[201], cross_ref='CDD:206701'), Site(id='site7195', name='active', type='active', positions=[202, 204, 221, 223, 230, 234, 238, 240], cross_ref='CDD:206701'), Site(id='site7196', name='other', type='unannotated', positions=[202, 205, 223, 231, 233, 235], cross_ref='CDD:206701'), Site(id='site7197', name='other', type='unannotated', positions=[202], cross_ref='CDD:206701'), Site(id='site7198', name='other', type='unannotated', positions=[219], cross_ref='CDD:206701'), Site(id='site7199', name='other', type='unannotated', positions=[224], cross_ref='CDD:206701'), Site(id='site7200', name='other', type='unannotated', positions=[227, 229], cross_ref='CDD:206701'), Site(id='site7201', name='other', type='unannotated', positions=[230], cross_ref='CDD:206701'), Site(id='site7202', name='other', type='unannotated', positions=[238], cross_ref='CDD:206701'), Site(id='site7203', name='other', type='unannotated', positions=[247], cross_ref='CDD:206701'), Site(id='site7204', name='other', type='unannotated', positions=[270], cross_ref='CDD:206701'), Site(id='site7205', name='other', type='unannotated', positions=[281], cross_ref='CDD:206701'), Site(id='site7206', name='other', type='unannotated', positions=[316], cross_ref='CDD:206701'), Site(id='site7207', name='other', type='unannotated', positions=[331], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='JAQJAO010000006.1', name=None, spans=[Span(id='span1505', start=5087556, end=5087654)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[])]" + "[ProteinInfo(id='proteininfo181', source_id='KAI9266821.1', name='small GTPase', sequence='MATSSPSTPSLSASRDDHFSTTRNSSNEVSETTSQPPRKSIVLKTGIVGDTNIGKTSLMVKYAEGAFDEEYVQTLGVNFMEKSIIIKNTEITFTIWDLGGQKEFVSMLPLVCDDAVAILFTFDLTRKETLNSIKEWYRQARGMNLSAVPLLVGTKYDEFVHLSDNYHEEVTRQARKYARAMKAPLIFCSTAHSINVQKIYKIVLAKTFDLECTIPEISETGGPIIEYKYC', organism=Organism(id='organism774', name='Phascolomyces articulosus', taxonomy_id='60185', domain='Eukaryota', kingdom='Fungi', phylum='Mucoromycota', tax_class='Mucoromycetes', order='Mucorales', family='Lichtheimiaceae', genus='Phascolomyces', species='Phascolomyces articulosus'), citation=Citation(id='citation4', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion386', name='Spg1', spans=[Span(id='span1439', start=42, end=224)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site6908', name='other', type='unannotated', positions=[42], cross_ref='CDD:206701'), Site(id='site6909', name='other', type='unannotated', positions=[48], cross_ref='CDD:206701'), Site(id='site6910', name='other', type='unannotated', positions=[50, 66, 72, 99, 153, 156, 188], cross_ref='CDD:206701'), Site(id='site6911', name='other', type='unannotated', positions=[56, 71], cross_ref='CDD:206701'), Site(id='site6912', name='other', type='unannotated', positions=[66, 71], cross_ref='CDD:206701'), Site(id='site6913', name='other', type='unannotated', positions=[71, 75, 89, 91], cross_ref='CDD:206701'), Site(id='site6914', name='other', type='unannotated', positions=[73], cross_ref='CDD:206701'), Site(id='site6915', name='active', type='active', positions=[74, 76, 93, 95, 102, 106, 110, 112], cross_ref='CDD:206701'), Site(id='site6916', name='other', type='unannotated', positions=[74, 77, 95, 103, 105, 107], cross_ref='CDD:206701'), Site(id='site6917', name='other', type='unannotated', positions=[74], cross_ref='CDD:206701'), Site(id='site6918', name='other', type='unannotated', positions=[91], cross_ref='CDD:206701'), Site(id='site6919', name='other', type='unannotated', positions=[96], cross_ref='CDD:206701'), Site(id='site6920', name='other', type='unannotated', positions=[99, 101], cross_ref='CDD:206701'), Site(id='site6921', name='other', type='unannotated', positions=[102], cross_ref='CDD:206701'), Site(id='site6922', name='other', type='unannotated', positions=[110], cross_ref='CDD:206701'), Site(id='site6923', name='other', type='unannotated', positions=[119], cross_ref='CDD:206701'), Site(id='site6924', name='other', type='unannotated', positions=[142], cross_ref='CDD:206701'), Site(id='site6925', name='other', type='unannotated', positions=[153], cross_ref='CDD:206701'), Site(id='site6926', name='other', type='unannotated', positions=[188], cross_ref='CDD:206701'), Site(id='site6927', name='other', type='unannotated', positions=[203], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='JAIXMP010000010.1', name=None, spans=[Span(id='span1445', start=1202621, end=1202794)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo19', source_id='KAG0659075.1', name='Ras GTPase tem1', sequence='MQQNTPTSINRHDPIPGVKNQIDIQIGLVGDAQVGKTSLMVKYVQNIFDNEYTQTLGVNFLKRKVKLRQTEIIFSLMDLGGQSEFINMLPLAAVGSSVIIFLFDLTRPVTLKSIKGWFRQAKGLNDLAIPILVGTKFDLFYTMDSQYQNEISKVAMEYSQVMDAPLIFCSTAKSIHIQKIFKIALAKLFNLTLTINEINLPGDPLLIYKDKGNNCINQSTNNNNNNTNLRRKSSHGHTSNNNNNNNTPIQASPIAVRSSYNHRHH', organism=Organism(id='organism758', name='Kazachstania unispora', taxonomy_id='27294', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Saccharomycetes', order='Saccharomycetales', family='Saccharomycetaceae', genus='Kazachstania', species='Kazachstania unispora'), citation=Citation(id='citation8', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion224', name='P-loop_NTPase', spans=[Span(id='span802', start=23, end=205)], note='P-loop containing Nucleoside Triphosphate Hydrolases; cl38936', cross_reference='CDD:453896', type=None)], sites=[Site(id='site3944', name='other', type='unannotated', positions=[29], cross_ref='CDD:206648'), Site(id='site3945', name='other', type='unannotated', positions=[31, 80, 134, 137, 169], cross_ref='CDD:206648'), Site(id='site3946', name='other', type='unannotated', positions=[54], cross_ref='CDD:206648'), Site(id='site3947', name='other', type='unannotated', positions=[58], cross_ref='CDD:206648'), Site(id='site3948', name='other', type='unannotated', positions=[77], cross_ref='CDD:206648'), Site(id='site3949', name='other', type='unannotated', positions=[79, 96], cross_ref='CDD:206648'), Site(id='site3950', name='other', type='unannotated', positions=[134], cross_ref='CDD:206648'), Site(id='site3951', name='other', type='unannotated', positions=[169], cross_ref='CDD:206648')], coding_sequence_ref=DNARegion(id='PUHS01000053.1', name=None, spans=[Span(id='span803', start=13398, end=14195)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo118', source_id='KAI4152360.1', name='hypothetical protein', sequence='MDPNQPDAMDVSLDSGNHYGPAGHEGADDSSHSAIGYGQRPTTSQSADFPSAAYSPHTDTNSPNMAYQQQAMQTSRPASGLSDSTNRHASQDPNQRSNGQEKTQNSVVIKVGMVGDAQIGKTSLMVKYVEGSWDEDYIQTLGVNFMEKTISIRNTEITFSIWDLGGQREFVNMLPLVCNDAVAILFMFDLTRKSTLNSIKEWYRQGRGFNKTAIPFLVGTKFDTFVNFPREDQEEISNQARRFAKAMRASLIFSSTSHSINVQKIFKIVLSKAFDLKCTIPEIESVGEPLLIYQSVA', organism=Organism(id='organism649', name='Diploschistes diacapsis', taxonomy_id='150877', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Lecanoromycetes', order='Ostropales', family='Graphidaceae', genus='Diploschistes', species='Diploschistes diacapsis'), citation=Citation(id='citation9', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion324', name='Spg1', spans=[Span(id='span1189', start=108, end=290)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site5728', name='other', type='unannotated', positions=[108], cross_ref='CDD:206701'), Site(id='site5729', name='other', type='unannotated', positions=[114], cross_ref='CDD:206701'), Site(id='site5730', name='other', type='unannotated', positions=[116, 132, 138, 165, 219, 222, 254], cross_ref='CDD:206701'), Site(id='site5731', name='other', type='unannotated', positions=[122, 137], cross_ref='CDD:206701'), Site(id='site5732', name='other', type='unannotated', positions=[132, 137], cross_ref='CDD:206701'), Site(id='site5733', name='other', type='unannotated', positions=[137, 141, 155, 157], cross_ref='CDD:206701'), Site(id='site5734', name='other', type='unannotated', positions=[139], cross_ref='CDD:206701'), Site(id='site5735', name='active', type='active', positions=[140, 142, 159, 161, 168, 172, 176, 178], cross_ref='CDD:206701'), Site(id='site5736', name='other', type='unannotated', positions=[140, 143, 161, 169, 171, 173], cross_ref='CDD:206701'), Site(id='site5737', name='other', type='unannotated', positions=[140], cross_ref='CDD:206701'), Site(id='site5738', name='other', type='unannotated', positions=[157], cross_ref='CDD:206701'), Site(id='site5739', name='other', type='unannotated', positions=[162], cross_ref='CDD:206701'), Site(id='site5740', name='other', type='unannotated', positions=[165, 167], cross_ref='CDD:206701'), Site(id='site5741', name='other', type='unannotated', positions=[168], cross_ref='CDD:206701'), Site(id='site5742', name='other', type='unannotated', positions=[176], cross_ref='CDD:206701'), Site(id='site5743', name='other', type='unannotated', positions=[185], cross_ref='CDD:206701'), Site(id='site5744', name='other', type='unannotated', positions=[208], cross_ref='CDD:206701'), Site(id='site5745', name='other', type='unannotated', positions=[219], cross_ref='CDD:206701'), Site(id='site5746', name='other', type='unannotated', positions=[254], cross_ref='CDD:206701'), Site(id='site5747', name='other', type='unannotated', positions=[269], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='JALAIB010000301.1', name=None, spans=[Span(id='span1192', start=17143, end=17566)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo27', source_id='SCV99388.1', name='LAFE_0A02080g1_1', sequence='MSLATSDTPVLPHPTNTFTIKVGLIGDAQVGKTSLMVKYVENVFDEEYTQTLGVNCLDKKIRLGSADILFYIMDLGGQREFINMLPLASEGAKAIIFLFDLTRPETLKSIKEWHRQATGFNENAVPLLVGTKYDLFVNMDPEYQVQVSKQSMRYAQVMDAPLIFCSTSHSINVQRIFKIIIAKIYNLTMRASEIKQIGDPLLIYKYLGNSRRRSPSS', organism=Organism(id='organism753', name='Lachancea fermentati', taxonomy_id='4955', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Saccharomycetes', order='Saccharomycetales', family='Saccharomycetaceae', genus='Lachancea', species='Lachancea fermentati'), citation=Citation(id='citation20', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion232', name='P-loop_NTPase', spans=[Span(id='span818', start=19, end=201)], note='P-loop containing Nucleoside Triphosphate Hydrolases; cl38936', cross_reference='CDD:453896', type=None)], sites=[Site(id='site4056', name='other', type='unannotated', positions=[25], cross_ref='CDD:206648'), Site(id='site4057', name='other', type='unannotated', positions=[27, 76, 130, 133, 165], cross_ref='CDD:206648'), Site(id='site4058', name='other', type='unannotated', positions=[50], cross_ref='CDD:206648'), Site(id='site4059', name='other', type='unannotated', positions=[54], cross_ref='CDD:206648'), Site(id='site4060', name='other', type='unannotated', positions=[73], cross_ref='CDD:206648'), Site(id='site4061', name='other', type='unannotated', positions=[75, 92], cross_ref='CDD:206648'), Site(id='site4062', name='other', type='unannotated', positions=[130], cross_ref='CDD:206648'), Site(id='site4063', name='other', type='unannotated', positions=[165], cross_ref='CDD:206648')], coding_sequence_ref=DNARegion(id='LT598487.1', name=None, spans=[Span(id='span819', start=207838, end=208491)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo48', source_id='OBA26219.1', name='small GTPase', sequence='MESMLQSHNRRRHTNSMSISGSSNNNVHSELRQNNNNRISSMPSSQPASNSNNKPVDNIKLTEKDMNKYIDKDLVPKTTFKLKVGIIGDAQVGKTTLMCKYVNSAFDDEYIQTLGIHHLQKKETLKYSNILFTINDLGGQREFINMLPIVSEGAVAIIYLFDLTQPESLNSIKEWYRQAKGLNEKAISILVGTKYDLFLDLDPEYQTNISQIATLYSEAMNAPLIFASTAASINVKIIFKVIVAKAFGLDLAVPEITQIGDPLLIYKKLGNVIKQIKR', organism=Organism(id='organism610', name='Hanseniaspora valbyensis NRRL Y-1626', taxonomy_id='766949', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Saccharomycetes', order='Saccharomycetales', family='Saccharomycodaceae', genus='Hanseniaspora', species='Hanseniaspora valbyensis'), citation=Citation(id='citation22', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion253', name='P-loop_NTPase', spans=[Span(id='span899', start=81, end=263)], note='P-loop containing Nucleoside Triphosphate Hydrolases; cl38936', cross_reference='CDD:453896', type=None)], sites=[Site(id='site4440', name='other', type='unannotated', positions=[87], cross_ref='CDD:206648'), Site(id='site4441', name='other', type='unannotated', positions=[89, 138, 192, 195, 227], cross_ref='CDD:206648'), Site(id='site4442', name='other', type='unannotated', positions=[112], cross_ref='CDD:206648'), Site(id='site4443', name='other', type='unannotated', positions=[116], cross_ref='CDD:206648'), Site(id='site4444', name='other', type='unannotated', positions=[135], cross_ref='CDD:206648'), Site(id='site4445', name='other', type='unannotated', positions=[137, 154], cross_ref='CDD:206648'), Site(id='site4446', name='other', type='unannotated', positions=[192], cross_ref='CDD:206648'), Site(id='site4447', name='other', type='unannotated', positions=[227], cross_ref='CDD:206648')], coding_sequence_ref=DNARegion(id='LXPE01000021.1', name=None, spans=[Span(id='span900', start=30702, end=31538)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo141', source_id='KAG2234214.1', name='hypothetical protein', sequence='MSESSSSTATTNPAVMSASVSTQGSQLDSAPTSSVVLKMGIIGDAQIGKTSLMIKYAEGAYDTEYIQTLGVNFMEKTILIRKTEITFSIWDLGGQKEFASMLPLVCNDSVAILFTFDLSRKSTLNSLREWYRQARGLNKSAIPLLVGTKYDEFVNLSYEDQEEITRQSRKYARVMKAPLVFCSTAESINVQKIYKIVLAKVFDLNCTLPEITEIGGPIIEYKHC', organism=Organism(id='organism738', name='Thamnidium elegans', taxonomy_id='101142', domain='Eukaryota', kingdom='Fungi', phylum='Mucoromycota', tax_class='Mucoromycetes', order='Mucorales', family='Mucoraceae', genus='Thamnidium', species='Thamnidium elegans'), citation=Citation(id='citation39', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion345', name='Spg1', spans=[Span(id='span1270', start=36, end=218)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site6128', name='other', type='unannotated', positions=[36], cross_ref='CDD:206701'), Site(id='site6129', name='other', type='unannotated', positions=[42], cross_ref='CDD:206701'), Site(id='site6130', name='other', type='unannotated', positions=[44, 60, 66, 93, 147, 150, 182], cross_ref='CDD:206701'), Site(id='site6131', name='other', type='unannotated', positions=[50, 65], cross_ref='CDD:206701'), Site(id='site6132', name='other', type='unannotated', positions=[60, 65], cross_ref='CDD:206701'), Site(id='site6133', name='other', type='unannotated', positions=[65, 69, 83, 85], cross_ref='CDD:206701'), Site(id='site6134', name='other', type='unannotated', positions=[67], cross_ref='CDD:206701'), Site(id='site6135', name='active', type='active', positions=[68, 70, 87, 89, 96, 100, 104, 106], cross_ref='CDD:206701'), Site(id='site6136', name='other', type='unannotated', positions=[68, 71, 89, 97, 99, 101], cross_ref='CDD:206701'), Site(id='site6137', name='other', type='unannotated', positions=[68], cross_ref='CDD:206701'), Site(id='site6138', name='other', type='unannotated', positions=[85], cross_ref='CDD:206701'), Site(id='site6139', name='other', type='unannotated', positions=[90], cross_ref='CDD:206701'), Site(id='site6140', name='other', type='unannotated', positions=[93, 95], cross_ref='CDD:206701'), Site(id='site6141', name='other', type='unannotated', positions=[96], cross_ref='CDD:206701'), Site(id='site6142', name='other', type='unannotated', positions=[104], cross_ref='CDD:206701'), Site(id='site6143', name='other', type='unannotated', positions=[113], cross_ref='CDD:206701'), Site(id='site6144', name='other', type='unannotated', positions=[136], cross_ref='CDD:206701'), Site(id='site6145', name='other', type='unannotated', positions=[147], cross_ref='CDD:206701'), Site(id='site6146', name='other', type='unannotated', positions=[182], cross_ref='CDD:206701'), Site(id='site6147', name='other', type='unannotated', positions=[197], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='JAEPRE010000058.1', name=None, spans=[Span(id='span1275', start=17551, end=17808)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo24', source_id='SMN21971.1', name='similar to Saccharomyces cerevisiae YML064C TEM1 GTP-binding protein of the ras superfamily involved in termination of M-phase', sequence='MNSEKNKVSPREEIQLQIGIIGDAQVGKTSLMVKYVQDIYNKEYTQTLGVNFLKKRIRLRSTDITLSLMDLGGQREFINMLPLAAMDSYCIILLFDLTRPETLKSVKEWYRQAFGLNKNAIPILVGTKYDLFIEMDQDYQEEISRTCLKYAQIMDSPVIFSSSAYSINIQKLFKIIISKIFNLTLTLAEISDIGDPLLIYKEFGNTHLNGL', organism=Organism(id='organism652', name='Kazachstania saulgeensis', taxonomy_id='1789683', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Saccharomycetes', order='Saccharomycetales', family='Saccharomycetaceae', genus='Kazachstania', species='Kazachstania saulgeensis'), citation=Citation(id='citation44', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion229', name='P-loop_NTPase', spans=[Span(id='span812', start=15, end=197)], note='P-loop containing Nucleoside Triphosphate Hydrolases; cl38936', cross_reference='CDD:453896', type=None)], sites=[Site(id='site4032', name='other', type='unannotated', positions=[21], cross_ref='CDD:206648'), Site(id='site4033', name='other', type='unannotated', positions=[23, 72, 126, 129, 161], cross_ref='CDD:206648'), Site(id='site4034', name='other', type='unannotated', positions=[46], cross_ref='CDD:206648'), Site(id='site4035', name='other', type='unannotated', positions=[50], cross_ref='CDD:206648'), Site(id='site4036', name='other', type='unannotated', positions=[69], cross_ref='CDD:206648'), Site(id='site4037', name='other', type='unannotated', positions=[71, 88], cross_ref='CDD:206648'), Site(id='site4038', name='other', type='unannotated', positions=[126], cross_ref='CDD:206648'), Site(id='site4039', name='other', type='unannotated', positions=[161], cross_ref='CDD:206648')], coding_sequence_ref=DNARegion(id='FXLY01000009.1', name=None, spans=[Span(id='span813', start=680896, end=681531)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo44', source_id='KAF8249497.1', name='small GTPase', sequence='MQQHDDDALPQPQTTELPLAPSLPPPVALAHSPPTPRRRNSSPTRHYHSSSNASGAPPYIGSPGSVESAQYYNSQGSPPTPAAPHQQQQFATSRADGGYHNGGGYAQQQLEAKRNSVVIKVGMVGDAQIGKTSLMVKYVEGSFDEDYIQTLGVNFMEKTISIRNTEITFSIWDLGGQREFVNMLPLVCNDAVAILFMFDLTRKSTLNSIKEWYRQARGFNKTAIPFLVGTKYDHFVNFPREDQEEISKQARKFARAMKASLIFSSTSHSINVQKIFKIVLSKAFDLKCTIPEIENVGEPLLLYQDV', organism=Organism(id='organism717', name='Wilcoxina mikolae CBS 423.85', taxonomy_id='1314677', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Pezizomycetes', order='Pezizales', family='Pyronemataceae', genus='Wilcoxina', species='Wilcoxina mikolae'), citation=Citation(id='citation48', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion249', name='Spg1', spans=[Span(id='span875', start=118, end=300)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site4360', name='other', type='unannotated', positions=[118], cross_ref='CDD:206701'), Site(id='site4361', name='other', type='unannotated', positions=[124], cross_ref='CDD:206701'), Site(id='site4362', name='other', type='unannotated', positions=[126, 142, 148, 175, 229, 232, 264], cross_ref='CDD:206701'), Site(id='site4363', name='other', type='unannotated', positions=[132, 147], cross_ref='CDD:206701'), Site(id='site4364', name='other', type='unannotated', positions=[142, 147], cross_ref='CDD:206701'), Site(id='site4365', name='other', type='unannotated', positions=[147, 151, 165, 167], cross_ref='CDD:206701'), Site(id='site4366', name='other', type='unannotated', positions=[149], cross_ref='CDD:206701'), Site(id='site4367', name='active', type='active', positions=[150, 152, 169, 171, 178, 182, 186, 188], cross_ref='CDD:206701'), Site(id='site4368', name='other', type='unannotated', positions=[150, 153, 171, 179, 181, 183], cross_ref='CDD:206701'), Site(id='site4369', name='other', type='unannotated', positions=[150], cross_ref='CDD:206701'), Site(id='site4370', name='other', type='unannotated', positions=[167], cross_ref='CDD:206701'), Site(id='site4371', name='other', type='unannotated', positions=[172], cross_ref='CDD:206701'), Site(id='site4372', name='other', type='unannotated', positions=[175, 177], cross_ref='CDD:206701'), Site(id='site4373', name='other', type='unannotated', positions=[178], cross_ref='CDD:206701'), Site(id='site4374', name='other', type='unannotated', positions=[186], cross_ref='CDD:206701'), Site(id='site4375', name='other', type='unannotated', positions=[195], cross_ref='CDD:206701'), Site(id='site4376', name='other', type='unannotated', positions=[218], cross_ref='CDD:206701'), Site(id='site4377', name='other', type='unannotated', positions=[229], cross_ref='CDD:206701'), Site(id='site4378', name='other', type='unannotated', positions=[264], cross_ref='CDD:206701'), Site(id='site4379', name='other', type='unannotated', positions=[279], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='MU122113.1', name=None, spans=[Span(id='span879', start=186204, end=186657)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo195', source_id='KAJ5714723.1', name='hypothetical protein', sequence='MDPLQQPVSEVPAVAEPESQPPIEPQAPEVAQPLPMSSEQPLEHAPAPVTVNVEEPAPEDPTTDSTVGPEESTTSRVAVEPGYASDSKAHYHSRSSEFHHSAPEYNPRDTDPASRFTSSPSLTQQPQLHNASRPGSGFSSGPERQAAPQPTLPEVPHRGQLSQASRNSVVIKVGMVGDAQIGKTSLMVKYVEGSWDEDYIQTLGVNFMEKTISIRNTEITFSIWDLGGQREFVNMLPLVCNDAVAILFMFDLTRKSTLNSIKEWYRQGRGFNKTAIPFLVGTKYDHFVNFPREDQEEISLQAKRFAKAMKASLIFSSTSHSINVQKIFKIVLAKAFDLKCTIPEIENVGEPLLLYKSV', organism=Organism(id='organism626', name='Penicillium malachiteum', taxonomy_id='1324776', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Eurotiomycetes', order='Eurotiales', family='Aspergillaceae', genus='Penicillium', species='Penicillium malachiteum'), citation=Citation(id='citation71', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion400', name='PHA03247', spans=[Span(id='span1501', start=6, end=156)], note='large tegument protein UL36; Provisional', cross_reference='CDD:223021', type=None), ProteinRegion(id='proteinregion401', name='Spg1', spans=[Span(id='span1502', start=170, end=352)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site7188', name='other', type='unannotated', positions=[170], cross_ref='CDD:206701'), Site(id='site7189', name='other', type='unannotated', positions=[176], cross_ref='CDD:206701'), Site(id='site7190', name='other', type='unannotated', positions=[178, 194, 200, 227, 281, 284, 316], cross_ref='CDD:206701'), Site(id='site7191', name='other', type='unannotated', positions=[184, 199], cross_ref='CDD:206701'), Site(id='site7192', name='other', type='unannotated', positions=[194, 199], cross_ref='CDD:206701'), Site(id='site7193', name='other', type='unannotated', positions=[199, 203, 217, 219], cross_ref='CDD:206701'), Site(id='site7194', name='other', type='unannotated', positions=[201], cross_ref='CDD:206701'), Site(id='site7195', name='active', type='active', positions=[202, 204, 221, 223, 230, 234, 238, 240], cross_ref='CDD:206701'), Site(id='site7196', name='other', type='unannotated', positions=[202, 205, 223, 231, 233, 235], cross_ref='CDD:206701'), Site(id='site7197', name='other', type='unannotated', positions=[202], cross_ref='CDD:206701'), Site(id='site7198', name='other', type='unannotated', positions=[219], cross_ref='CDD:206701'), Site(id='site7199', name='other', type='unannotated', positions=[224], cross_ref='CDD:206701'), Site(id='site7200', name='other', type='unannotated', positions=[227, 229], cross_ref='CDD:206701'), Site(id='site7201', name='other', type='unannotated', positions=[230], cross_ref='CDD:206701'), Site(id='site7202', name='other', type='unannotated', positions=[238], cross_ref='CDD:206701'), Site(id='site7203', name='other', type='unannotated', positions=[247], cross_ref='CDD:206701'), Site(id='site7204', name='other', type='unannotated', positions=[270], cross_ref='CDD:206701'), Site(id='site7205', name='other', type='unannotated', positions=[281], cross_ref='CDD:206701'), Site(id='site7206', name='other', type='unannotated', positions=[316], cross_ref='CDD:206701'), Site(id='site7207', name='other', type='unannotated', positions=[331], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='JAQJAO010000006.1', name=None, spans=[Span(id='span1505', start=5087556, end=5087654)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo151', source_id='KAI0396868.1', name='septum initiation protein sid3', sequence='MDQEPHDAPMVGPHDNIDEIPAGVEEPQQMSPSHNGHDIYHQQSDDLDQDMPHHPASDEVDPNAQYETPPSTAYPQSISRPPSGLSNGGGDRHTSHSDRGSNGAEQATTRNQVVIKVGMVGDAQIGKTSLMVKYVEGSWDEDYIQTLGVNFMEKTISIRNTEITFSIWDLGGQREFVNMLPLVCNDAVAILFMFDLTRKSTLNSIKEWYRQGRGFNKTAIPVLVGTKYDHFVNFSREEQEEISNQARRFAKAMRASLIFSSTSHSINVQKIFKIVLSKAFDLKCTIPEIENVGEPLLLYQNC', organism=Organism(id='organism756', name='Xylariaceae sp. FL0594', taxonomy_id='2614590', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Sordariomycetes', order='Xylariales', family='Xylariaceae', genus=None, species='Xylariaceae sp. FL0594'), citation=Citation(id='citation83', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion355', name='Spg1', spans=[Span(id='span1309', start=114, end=296)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site6328', name='other', type='unannotated', positions=[114], cross_ref='CDD:206701'), Site(id='site6329', name='other', type='unannotated', positions=[120], cross_ref='CDD:206701'), Site(id='site6330', name='other', type='unannotated', positions=[122, 138, 144, 171, 225, 228, 260], cross_ref='CDD:206701'), Site(id='site6331', name='other', type='unannotated', positions=[128, 143], cross_ref='CDD:206701'), Site(id='site6332', name='other', type='unannotated', positions=[138, 143], cross_ref='CDD:206701'), Site(id='site6333', name='other', type='unannotated', positions=[143, 147, 161, 163], cross_ref='CDD:206701'), Site(id='site6334', name='other', type='unannotated', positions=[145], cross_ref='CDD:206701'), Site(id='site6335', name='active', type='active', positions=[146, 148, 165, 167, 174, 178, 182, 184], cross_ref='CDD:206701'), Site(id='site6336', name='other', type='unannotated', positions=[146, 149, 167, 175, 177, 179], cross_ref='CDD:206701'), Site(id='site6337', name='other', type='unannotated', positions=[146], cross_ref='CDD:206701'), Site(id='site6338', name='other', type='unannotated', positions=[163], cross_ref='CDD:206701'), Site(id='site6339', name='other', type='unannotated', positions=[168], cross_ref='CDD:206701'), Site(id='site6340', name='other', type='unannotated', positions=[171, 173], cross_ref='CDD:206701'), Site(id='site6341', name='other', type='unannotated', positions=[174], cross_ref='CDD:206701'), Site(id='site6342', name='other', type='unannotated', positions=[182], cross_ref='CDD:206701'), Site(id='site6343', name='other', type='unannotated', positions=[191], cross_ref='CDD:206701'), Site(id='site6344', name='other', type='unannotated', positions=[214], cross_ref='CDD:206701'), Site(id='site6345', name='other', type='unannotated', positions=[225], cross_ref='CDD:206701'), Site(id='site6346', name='other', type='unannotated', positions=[260], cross_ref='CDD:206701'), Site(id='site6347', name='other', type='unannotated', positions=[275], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='MU280778.1', name=None, spans=[Span(id='span1312', start=360785, end=360883)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo165', source_id='KAJ1329801.1', name='GTP-binding protein', sequence='MPFSRDFQSLPSSMDQDFPVSPPPWSPPPRSSPPRSSPPRSPFDIAPDDVPDDDTLGAIGSSPRVYSDPEPTIRTHHHSQSLDQGLATSVQHHHHQQHNSDDMEHQAGHDTPPTPQQQPQSISRPPSGLEGPADRHAAQYNDHATRGASGQQEPQNGRNQVVIKVGMVGDAQIGKTSLMVKYVEGSWDEDYIQTLGVNFMEKTISIRNTEITFSIWDLGGQREFVNMLPLVCNDAVAILFMFDLTRKSTLNSIKEWYRQGRGFNKTAIPVLVGTKYDHFVNFPREDQEEISNQARRFAKAMRAPLIFSSTSHSINVQKIFKIVLSKAFDLKCTIPEIENVGEPLLLYQSC', organism=Organism(id='organism689', name='Microdochium nivale', taxonomy_id='5520', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Sordariomycetes', order='Xylariales', family='Microdochiaceae', genus='Microdochium', species='Microdochium nivale'), citation=Citation(id='citation94', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion370', name='Spg1', spans=[Span(id='span1370', start=162, end=344)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site6608', name='other', type='unannotated', positions=[162], cross_ref='CDD:206701'), Site(id='site6609', name='other', type='unannotated', positions=[168], cross_ref='CDD:206701'), Site(id='site6610', name='other', type='unannotated', positions=[170, 186, 192, 219, 273, 276, 308], cross_ref='CDD:206701'), Site(id='site6611', name='other', type='unannotated', positions=[176, 191], cross_ref='CDD:206701'), Site(id='site6612', name='other', type='unannotated', positions=[186, 191], cross_ref='CDD:206701'), Site(id='site6613', name='other', type='unannotated', positions=[191, 195, 209, 211], cross_ref='CDD:206701'), Site(id='site6614', name='other', type='unannotated', positions=[193], cross_ref='CDD:206701'), Site(id='site6615', name='active', type='active', positions=[194, 196, 213, 215, 222, 226, 230, 232], cross_ref='CDD:206701'), Site(id='site6616', name='other', type='unannotated', positions=[194, 197, 215, 223, 225, 227], cross_ref='CDD:206701'), Site(id='site6617', name='other', type='unannotated', positions=[194], cross_ref='CDD:206701'), Site(id='site6618', name='other', type='unannotated', positions=[211], cross_ref='CDD:206701'), Site(id='site6619', name='other', type='unannotated', positions=[216], cross_ref='CDD:206701'), Site(id='site6620', name='other', type='unannotated', positions=[219, 221], cross_ref='CDD:206701'), Site(id='site6621', name='other', type='unannotated', positions=[222], cross_ref='CDD:206701'), Site(id='site6622', name='other', type='unannotated', positions=[230], cross_ref='CDD:206701'), Site(id='site6623', name='other', type='unannotated', positions=[239], cross_ref='CDD:206701'), Site(id='site6624', name='other', type='unannotated', positions=[262], cross_ref='CDD:206701'), Site(id='site6625', name='other', type='unannotated', positions=[273], cross_ref='CDD:206701'), Site(id='site6626', name='other', type='unannotated', positions=[308], cross_ref='CDD:206701'), Site(id='site6627', name='other', type='unannotated', positions=[323], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='JANTFD010000005.1', name=None, spans=[Span(id='span1373', start=3077269, end=3077367)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo90', source_id='GES63634.1', name='hypothetical protein', sequence='METIHNLNTGVSEPVEQQPQESAFEASPVSHPAGSTDEPTSASVYHRSGYNSDSHAQYSSLATHQPQPPTSSRPSSGLSGPERYGPSQEVTQKQPSQPPSSQTKNSVVIKVGMVGDAQIGKTSLMVKYVEGSWDEDYIQTLGVNFMEKTISIRNTEITFSIWDLGGQREFVNMLPLVCNDAVAILFMFDLTRKSTLNSIKEWYRQGRGFNKTAIPFLVGTKYDHFVNFPREDQEEISIQAKRFAKAMKASLIFSSTSHSINVQKIFKIVLAKAFDLKCTIPEIENIGEPLLLYKNV', organism=Organism(id='organism741', name='Aspergillus terreus', taxonomy_id='33178', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Eurotiomycetes', order='Eurotiales', family='Aspergillaceae', genus='Aspergillus', species='Aspergillus terreus'), citation=Citation(id='citation95', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion295', name='Spg1', spans=[Span(id='span1084', start=108, end=290)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site5228', name='other', type='unannotated', positions=[108], cross_ref='CDD:206701'), Site(id='site5229', name='other', type='unannotated', positions=[114], cross_ref='CDD:206701'), Site(id='site5230', name='other', type='unannotated', positions=[116, 132, 138, 165, 219, 222, 254], cross_ref='CDD:206701'), Site(id='site5231', name='other', type='unannotated', positions=[122, 137], cross_ref='CDD:206701'), Site(id='site5232', name='other', type='unannotated', positions=[132, 137], cross_ref='CDD:206701'), Site(id='site5233', name='other', type='unannotated', positions=[137, 141, 155, 157], cross_ref='CDD:206701'), Site(id='site5234', name='other', type='unannotated', positions=[139], cross_ref='CDD:206701'), Site(id='site5235', name='active', type='active', positions=[140, 142, 159, 161, 168, 172, 176, 178], cross_ref='CDD:206701'), Site(id='site5236', name='other', type='unannotated', positions=[140, 143, 161, 169, 171, 173], cross_ref='CDD:206701'), Site(id='site5237', name='other', type='unannotated', positions=[140], cross_ref='CDD:206701'), Site(id='site5238', name='other', type='unannotated', positions=[157], cross_ref='CDD:206701'), Site(id='site5239', name='other', type='unannotated', positions=[162], cross_ref='CDD:206701'), Site(id='site5240', name='other', type='unannotated', positions=[165, 167], cross_ref='CDD:206701'), Site(id='site5241', name='other', type='unannotated', positions=[168], cross_ref='CDD:206701'), Site(id='site5242', name='other', type='unannotated', positions=[176], cross_ref='CDD:206701'), Site(id='site5243', name='other', type='unannotated', positions=[185], cross_ref='CDD:206701'), Site(id='site5244', name='other', type='unannotated', positions=[208], cross_ref='CDD:206701'), Site(id='site5245', name='other', type='unannotated', positions=[219], cross_ref='CDD:206701'), Site(id='site5246', name='other', type='unannotated', positions=[254], cross_ref='CDD:206701'), Site(id='site5247', name='other', type='unannotated', positions=[269], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='BKZM02000009.1', name=None, spans=[Span(id='span1087', start=899015, end=899113)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo157', source_id='KXX77220.1', name='Septum-promoting GTP-binding protein 1', sequence='MEHQNPFVAPAQFAAPAHDTVPTDAHDDTLGGIENPGHVVSGHAMPEHAIPEHAFPDQNGFHEPHPVQSQPVDHDLGNSPQPYYPPENGDSDPTARYATPPIPAPAISRPPSGLSGQGAAYGADQASRAGSNGAPATEAQNNGRNHVVIKVGMVGDAQIGKTSLMVKYVEGSWDEDYIQTLGVNFMEKTISIRNTEITFSIWDLGGQREFVNMLPLVCNDAVAILFMFDLTRKSTLNSIKEWYRQGRGFNKTAIPVLVGTKYDHFVNLSREEQEEISNQARRFAKAMRAALIFSSTSHSINVQKIFKIVLSKAFDLKCTIPEITNVGEPLLIYQSC', organism=Organism(id='organism724', name='Madurella mycetomatis', taxonomy_id='100816', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Sordariomycetes', order='Sordariales', family=None, genus='Madurella', species='Madurella mycetomatis'), citation=Citation(id='citation97', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion361', name='PRK07764', spans=[Span(id='span1335', start=9, end=139)], note='DNA polymerase III subunits gamma and tau; Validated', cross_reference='CDD:236090', type=None), ProteinRegion(id='proteinregion362', name='Spg1', spans=[Span(id='span1336', start=148, end=330)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site6448', name='other', type='unannotated', positions=[148], cross_ref='CDD:206701'), Site(id='site6449', name='other', type='unannotated', positions=[154], cross_ref='CDD:206701'), Site(id='site6450', name='other', type='unannotated', positions=[156, 172, 178, 205, 259, 262, 294], cross_ref='CDD:206701'), Site(id='site6451', name='other', type='unannotated', positions=[162, 177], cross_ref='CDD:206701'), Site(id='site6452', name='other', type='unannotated', positions=[172, 177], cross_ref='CDD:206701'), Site(id='site6453', name='other', type='unannotated', positions=[177, 181, 195, 197], cross_ref='CDD:206701'), Site(id='site6454', name='other', type='unannotated', positions=[179], cross_ref='CDD:206701'), Site(id='site6455', name='active', type='active', positions=[180, 182, 199, 201, 208, 212, 216, 218], cross_ref='CDD:206701'), Site(id='site6456', name='other', type='unannotated', positions=[180, 183, 201, 209, 211, 213], cross_ref='CDD:206701'), Site(id='site6457', name='other', type='unannotated', positions=[180], cross_ref='CDD:206701'), Site(id='site6458', name='other', type='unannotated', positions=[197], cross_ref='CDD:206701'), Site(id='site6459', name='other', type='unannotated', positions=[202], cross_ref='CDD:206701'), Site(id='site6460', name='other', type='unannotated', positions=[205, 207], cross_ref='CDD:206701'), Site(id='site6461', name='other', type='unannotated', positions=[208], cross_ref='CDD:206701'), Site(id='site6462', name='other', type='unannotated', positions=[216], cross_ref='CDD:206701'), Site(id='site6463', name='other', type='unannotated', positions=[225], cross_ref='CDD:206701'), Site(id='site6464', name='other', type='unannotated', positions=[248], cross_ref='CDD:206701'), Site(id='site6465', name='other', type='unannotated', positions=[259], cross_ref='CDD:206701'), Site(id='site6466', name='other', type='unannotated', positions=[294], cross_ref='CDD:206701'), Site(id='site6467', name='other', type='unannotated', positions=[309], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='LCTW02000171.1', name=None, spans=[Span(id='span1339', start=45294, end=46130)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo178', source_id='KAJ2975468.1', name='hypothetical protein', sequence='MENGDVPLHTPQDGHDDTLGAMDSSPQVASSSHLHSNNGSHEHEPQSPEQHHPSHHEDDPAAMRYTPPVPGGISSATASRPGSEMSNPSHQAHSDYNRQGSNEPSNGRNHVVIKVGMVGDAQIGKTSLMVKYVEGSWDEDYIQTLGVNFMEKTISIRNTEITFSIWDLGGQREFVNMLPLVCNDAVAILFMFDLTRKSTLNSIKEWYRQGRGFNKTAIPILVGTKYDHFVNFPIQDQEEISNQARRFAKAMRAALIFSSTSHSINVQKIFKIVLSKAFDLKCTIPEIENVGEPLLLYQSV', organism=Organism(id='organism637', name='Lecanicillium fungicola', taxonomy_id='93591', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Sordariomycetes', order='Hypocreales', family='Cordycipitaceae', genus='Lecanicillium', species='Lecanicillium fungicola'), citation=Citation(id='citation118', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion383', name='Spg1', spans=[Span(id='span1431', start=112, end=294)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site6848', name='other', type='unannotated', positions=[112], cross_ref='CDD:206701'), Site(id='site6849', name='other', type='unannotated', positions=[118], cross_ref='CDD:206701'), Site(id='site6850', name='other', type='unannotated', positions=[120, 136, 142, 169, 223, 226, 258], cross_ref='CDD:206701'), Site(id='site6851', name='other', type='unannotated', positions=[126, 141], cross_ref='CDD:206701'), Site(id='site6852', name='other', type='unannotated', positions=[136, 141], cross_ref='CDD:206701'), Site(id='site6853', name='other', type='unannotated', positions=[141, 145, 159, 161], cross_ref='CDD:206701'), Site(id='site6854', name='other', type='unannotated', positions=[143], cross_ref='CDD:206701'), Site(id='site6855', name='active', type='active', positions=[144, 146, 163, 165, 172, 176, 180, 182], cross_ref='CDD:206701'), Site(id='site6856', name='other', type='unannotated', positions=[144, 147, 165, 173, 175, 177], cross_ref='CDD:206701'), Site(id='site6857', name='other', type='unannotated', positions=[144], cross_ref='CDD:206701'), Site(id='site6858', name='other', type='unannotated', positions=[161], cross_ref='CDD:206701'), Site(id='site6859', name='other', type='unannotated', positions=[166], cross_ref='CDD:206701'), Site(id='site6860', name='other', type='unannotated', positions=[169, 171], cross_ref='CDD:206701'), Site(id='site6861', name='other', type='unannotated', positions=[172], cross_ref='CDD:206701'), Site(id='site6862', name='other', type='unannotated', positions=[180], cross_ref='CDD:206701'), Site(id='site6863', name='other', type='unannotated', positions=[189], cross_ref='CDD:206701'), Site(id='site6864', name='other', type='unannotated', positions=[212], cross_ref='CDD:206701'), Site(id='site6865', name='other', type='unannotated', positions=[223], cross_ref='CDD:206701'), Site(id='site6866', name='other', type='unannotated', positions=[258], cross_ref='CDD:206701'), Site(id='site6867', name='other', type='unannotated', positions=[273], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='JANJQO010000704.1', name=None, spans=[Span(id='span1434', start=1243, end=1971)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo162', source_id='RJE22746.1', name='GTP-binding protein', sequence='MDAPQQPVTDTTTLTDQQASQVQQPPQVQQAPQPPPQEQPLPQEQQLPQEHQQSIDHTPSENFRMVSTEDTKPKPSYHNGYSSDSYAQYSSRPAEYSMPARQPTPQEAEQARLTSTSINYQPPPPAMSRPSSGLSSGTERHPTSQQHLEPSQRNPPAKNSVVIKVGMVGDAQIGKTSLMVKYVEGNWDEDYIQTLGVNFMEKTISIRNTEITFSIWDLGGQREFVNMLPLVCNDAVAILFMFDLTRKSTLNSIKEWYRQGRGFNKTAIPFLIGTKYDHFVNFPREDQEEISIQAKRFARAMKASLIFSSTSHSINVQKIFKIVLAKAFDLKCTIPEVENIGEPLLLYKSV', organism=Organism(id='organism718', name='Aspergillus sclerotialis', taxonomy_id='2070753', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Eurotiomycetes', order='Eurotiales', family='Aspergillaceae', genus='Aspergillus', species='Aspergillus sclerotialis'), citation=Citation(id='citation123', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion367', name='Spg1', spans=[Span(id='span1356', start=162, end=344)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site6548', name='other', type='unannotated', positions=[162], cross_ref='CDD:206701'), Site(id='site6549', name='other', type='unannotated', positions=[168], cross_ref='CDD:206701'), Site(id='site6550', name='other', type='unannotated', positions=[170, 186, 192, 219, 273, 276, 308], cross_ref='CDD:206701'), Site(id='site6551', name='other', type='unannotated', positions=[176, 191], cross_ref='CDD:206701'), Site(id='site6552', name='other', type='unannotated', positions=[186, 191], cross_ref='CDD:206701'), Site(id='site6553', name='other', type='unannotated', positions=[191, 195, 209, 211], cross_ref='CDD:206701'), Site(id='site6554', name='other', type='unannotated', positions=[193], cross_ref='CDD:206701'), Site(id='site6555', name='active', type='active', positions=[194, 196, 213, 215, 222, 226, 230, 232], cross_ref='CDD:206701'), Site(id='site6556', name='other', type='unannotated', positions=[194, 197, 215, 223, 225, 227], cross_ref='CDD:206701'), Site(id='site6557', name='other', type='unannotated', positions=[194], cross_ref='CDD:206701'), Site(id='site6558', name='other', type='unannotated', positions=[211], cross_ref='CDD:206701'), Site(id='site6559', name='other', type='unannotated', positions=[216], cross_ref='CDD:206701'), Site(id='site6560', name='other', type='unannotated', positions=[219, 221], cross_ref='CDD:206701'), Site(id='site6561', name='other', type='unannotated', positions=[222], cross_ref='CDD:206701'), Site(id='site6562', name='other', type='unannotated', positions=[230], cross_ref='CDD:206701'), Site(id='site6563', name='other', type='unannotated', positions=[239], cross_ref='CDD:206701'), Site(id='site6564', name='other', type='unannotated', positions=[262], cross_ref='CDD:206701'), Site(id='site6565', name='other', type='unannotated', positions=[273], cross_ref='CDD:206701'), Site(id='site6566', name='other', type='unannotated', positions=[308], cross_ref='CDD:206701'), Site(id='site6567', name='other', type='unannotated', positions=[323], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='MVGC01000153.1', name=None, spans=[Span(id='span1360', start=43358, end=43456)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo78', source_id='VEU24358.1', name='DEKNAAC105557', sequence='MDNRSVSTRIPRSHSTSEALSEEPARSVSSTVDGYDNYYGHQHHHAHRHRYGEEERQSAEKNTVTIKVGLIGDAQVGKTSLMVKYVENCFDEIYTQTLGVNFMERTIRIKNTEITFSIWDLGGEAEFTNMLPLVASDAVAVLFMFDLSRKSTLNSVKDWYRQARGFNRTAIPFLVGTKYDLFVDLPDEEQEEITRQARKYAKAMNAPLIFSSTCASINIQKIFKIVISKTFDLRLKIPEIVNTGEPILLYQNV', organism=Organism(id='organism788', name='Brettanomyces naardenensis', taxonomy_id='13370', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Saccharomycetes', order='Saccharomycetales', family='Pichiaceae', genus='Brettanomyces', species='Brettanomyces naardenensis'), citation=Citation(id='citation126', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion285', name='Spg1', spans=[Span(id='span1031', start=65, end=247)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site5028', name='other', type='unannotated', positions=[65], cross_ref='CDD:206701'), Site(id='site5029', name='other', type='unannotated', positions=[71], cross_ref='CDD:206701'), Site(id='site5030', name='other', type='unannotated', positions=[73, 89, 95, 122, 176, 179, 211], cross_ref='CDD:206701'), Site(id='site5031', name='other', type='unannotated', positions=[79, 94], cross_ref='CDD:206701'), Site(id='site5032', name='other', type='unannotated', positions=[89, 94], cross_ref='CDD:206701'), Site(id='site5033', name='other', type='unannotated', positions=[94, 98, 112, 114], cross_ref='CDD:206701'), Site(id='site5034', name='other', type='unannotated', positions=[96], cross_ref='CDD:206701'), Site(id='site5035', name='active', type='active', positions=[97, 99, 116, 118, 125, 129, 133, 135], cross_ref='CDD:206701'), Site(id='site5036', name='other', type='unannotated', positions=[97, 100, 118, 126, 128, 130], cross_ref='CDD:206701'), Site(id='site5037', name='other', type='unannotated', positions=[97], cross_ref='CDD:206701'), Site(id='site5038', name='other', type='unannotated', positions=[114], cross_ref='CDD:206701'), Site(id='site5039', name='other', type='unannotated', positions=[119], cross_ref='CDD:206701'), Site(id='site5040', name='other', type='unannotated', positions=[122, 124], cross_ref='CDD:206701'), Site(id='site5041', name='other', type='unannotated', positions=[125], cross_ref='CDD:206701'), Site(id='site5042', name='other', type='unannotated', positions=[133], cross_ref='CDD:206701'), Site(id='site5043', name='other', type='unannotated', positions=[142], cross_ref='CDD:206701'), Site(id='site5044', name='other', type='unannotated', positions=[165], cross_ref='CDD:206701'), Site(id='site5045', name='other', type='unannotated', positions=[176], cross_ref='CDD:206701'), Site(id='site5046', name='other', type='unannotated', positions=[211], cross_ref='CDD:206701'), Site(id='site5047', name='other', type='unannotated', positions=[226], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='CAACVR010000076.1', name=None, spans=[Span(id='span1032', start=185354, end=186115)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo167', source_id='KAI9619392.1', name='hypothetical protein', sequence='MESALHYNSWSASSVHYNLFGLDRVAQKAGLDDQKTKNQPTLQNSERSRQSHSFNEYDTEEPKSKGNEEASREPEQKTEILTRSTMSHSNSAMPTGSSSSSSTTTTGGAPTSSNPPPSSNQSEDKNSVVIKVGMVGDSQIGKTSLMVKYVEGSFDEDYIQTLGVNFMEKTISIRNTEITFSIWDLGGQREFVNMLPLVCNDAVAILFMFDLSRKSTLNSIKEWYRQARGFNKTAIPFLIGTKYDHFAAFSKDEQEEITRQSRRFAKAMRAPLIFCSTSHSINVQKIFKIVLSKAFDLKCTIPEITGSGEPLLIYLDV', organism=Organism(id='organism790', name='Puccinia striiformis f. sp. tritici PST-130', taxonomy_id='875184', domain='Eukaryota', kingdom='Fungi', phylum='Basidiomycota', tax_class='Pucciniomycetes', order='Pucciniales', family='Pucciniaceae', genus='Puccinia', species='Puccinia striiformis'), citation=Citation(id='citation136', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[], sites=[], coding_sequence_ref=DNARegion(id='JAHLQV010000245.1', name=None, spans=[Span(id='span1384', start=197279, end=197362)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo192', source_id='KAJ5811426.1', name='hypothetical protein', sequence='MDAPQQPDCETPAVAEPSQHLPEARNPPPHMQPSQQLSSESPMEQPAPVNLNAGVTSGGPMVENVSANNGYNSDHKAHYHSQSTDFHHSAPEYPAQETDPSTRMASSPSLTAQPQLHPTSRPGSGLSSGPERLGVSQPPSADPSQRQASQASRNSVVIKVGMVGDAQIGKTSLMVKYVEGSWDEDYIQTLGVNFMEKTISIRNTEITFSIWDLGGQREFVNMLPLVCNDAVAILFMFDLTRKSTLNSIKEWYRQGRGFNKTAIPFLIGTKYDHFVNFPREDQEEISIQAKRFAKAMKASLIFSSTSHSINVQKIFKIVLAKAFDLKCTIPEIENVGEPLLLYKSV', organism=Organism(id='organism739', name='Penicillium riverlandense', taxonomy_id='1903569', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Eurotiomycetes', order='Eurotiales', family='Aspergillaceae', genus='Penicillium', species='Penicillium riverlandense'), citation=Citation(id='citation140', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion397', name='Spg1', spans=[Span(id='span1489', start=157, end=339)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site7128', name='other', type='unannotated', positions=[157], cross_ref='CDD:206701'), Site(id='site7129', name='other', type='unannotated', positions=[163], cross_ref='CDD:206701'), Site(id='site7130', name='other', type='unannotated', positions=[165, 181, 187, 214, 268, 271, 303], cross_ref='CDD:206701'), Site(id='site7131', name='other', type='unannotated', positions=[171, 186], cross_ref='CDD:206701'), Site(id='site7132', name='other', type='unannotated', positions=[181, 186], cross_ref='CDD:206701'), Site(id='site7133', name='other', type='unannotated', positions=[186, 190, 204, 206], cross_ref='CDD:206701'), Site(id='site7134', name='other', type='unannotated', positions=[188], cross_ref='CDD:206701'), Site(id='site7135', name='active', type='active', positions=[189, 191, 208, 210, 217, 221, 225, 227], cross_ref='CDD:206701'), Site(id='site7136', name='other', type='unannotated', positions=[189, 192, 210, 218, 220, 222], cross_ref='CDD:206701'), Site(id='site7137', name='other', type='unannotated', positions=[189], cross_ref='CDD:206701'), Site(id='site7138', name='other', type='unannotated', positions=[206], cross_ref='CDD:206701'), Site(id='site7139', name='other', type='unannotated', positions=[211], cross_ref='CDD:206701'), Site(id='site7140', name='other', type='unannotated', positions=[214, 216], cross_ref='CDD:206701'), Site(id='site7141', name='other', type='unannotated', positions=[217], cross_ref='CDD:206701'), Site(id='site7142', name='other', type='unannotated', positions=[225], cross_ref='CDD:206701'), Site(id='site7143', name='other', type='unannotated', positions=[234], cross_ref='CDD:206701'), Site(id='site7144', name='other', type='unannotated', positions=[257], cross_ref='CDD:206701'), Site(id='site7145', name='other', type='unannotated', positions=[268], cross_ref='CDD:206701'), Site(id='site7146', name='other', type='unannotated', positions=[303], cross_ref='CDD:206701'), Site(id='site7147', name='other', type='unannotated', positions=[318], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='JAQJAV010000005.1', name=None, spans=[Span(id='span1493', start=397714, end=397812)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo196', source_id='KAI9744794.1', name='septum-promoting GTP-binding protein 1', sequence='MDNDHTYDGPADQQDDAMGGTEESHTPTHGAFMTDKRHQQSQSADLPPTHQVYNEKVDPNARYPTPPTAYPAQNSRPGSGLSGSGGYGNMQSSYQDQNARGSGGSEQPRNKNSVVIKVGMVGDAQIGKTSLMVKYVEGSWDEDYIQTLGVNFMEKTISIRNTEITFSIWDLGGQREFVNMLPLVCNDAVAILFMFDLTRKSTLNSIKEWYRQGRGFNKTAIPFLVGTKYDHFVNFPREDQEEISNQARRFAKAMRASLIFSSTSHSINVQKIFKIVLSKAFDLKCTIPEIENVGEPLLIYQSVG', organism=Organism(id='organism692', name='Claussenomyces sp. TS43310', taxonomy_id='2939399', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Leotiomycetes', order='Leotiales', family='Tympanidaceae', genus='Claussenomyces', species='Claussenomyces sp. TS43310'), citation=Citation(id='citation145', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion402', name='Spg1', spans=[Span(id='span1506', start=115, end=297)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site7208', name='other', type='unannotated', positions=[115], cross_ref='CDD:206701'), Site(id='site7209', name='other', type='unannotated', positions=[121], cross_ref='CDD:206701'), Site(id='site7210', name='other', type='unannotated', positions=[123, 139, 145, 172, 226, 229, 261], cross_ref='CDD:206701'), Site(id='site7211', name='other', type='unannotated', positions=[129, 144], cross_ref='CDD:206701'), Site(id='site7212', name='other', type='unannotated', positions=[139, 144], cross_ref='CDD:206701'), Site(id='site7213', name='other', type='unannotated', positions=[144, 148, 162, 164], cross_ref='CDD:206701'), Site(id='site7214', name='other', type='unannotated', positions=[146], cross_ref='CDD:206701'), Site(id='site7215', name='active', type='active', positions=[147, 149, 166, 168, 175, 179, 183, 185], cross_ref='CDD:206701'), Site(id='site7216', name='other', type='unannotated', positions=[147, 150, 168, 176, 178, 180], cross_ref='CDD:206701'), Site(id='site7217', name='other', type='unannotated', positions=[147], cross_ref='CDD:206701'), Site(id='site7218', name='other', type='unannotated', positions=[164], cross_ref='CDD:206701'), Site(id='site7219', name='other', type='unannotated', positions=[169], cross_ref='CDD:206701'), Site(id='site7220', name='other', type='unannotated', positions=[172, 174], cross_ref='CDD:206701'), Site(id='site7221', name='other', type='unannotated', positions=[175], cross_ref='CDD:206701'), Site(id='site7222', name='other', type='unannotated', positions=[183], cross_ref='CDD:206701'), Site(id='site7223', name='other', type='unannotated', positions=[192], cross_ref='CDD:206701'), Site(id='site7224', name='other', type='unannotated', positions=[215], cross_ref='CDD:206701'), Site(id='site7225', name='other', type='unannotated', positions=[226], cross_ref='CDD:206701'), Site(id='site7226', name='other', type='unannotated', positions=[261], cross_ref='CDD:206701'), Site(id='site7227', name='other', type='unannotated', positions=[276], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='JAPETT010000005.1', name=None, spans=[Span(id='span1509', start=348173, end=348274)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo164', source_id='TQB69234.1', name='hypothetical protein', sequence='MDAPQLPEAGAPDAAEPQPPQELSQEQSQHQIPSTPLENANSPNLEVASDSAARSPNYNSGYSSDPRGNYSAGFNSPNRDYASRDGDQLSRVNSSPSINYQPQPTQQSRPPSSGVSSGPEQRQRQLPPTKNSVVIKVGMVGDAQIGKTSLMVKYVEGSWDEDYIQTLGVNFMEKTISIRNTEITFSIWDLGGQREFVNMLPLVCNDAVAILFMFDLTRKSTLNSIKEWYRQGRGFNKTAIPFLIGTKYDQFVNFPREDQEEISIQAKRFAKAMKASLIFSSTSHSINVQKIFKIVLAKAFDLKCTIPEIENIGEPLLLYKNV', organism=Organism(id='organism617', name='Monascus purpureus', taxonomy_id='5098', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Eurotiomycetes', order='Eurotiales', family='Aspergillaceae', genus='Monascus', species='Monascus purpureus'), citation=Citation(id='citation148', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion369', name='Spg1', spans=[Span(id='span1365', start=134, end=316)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site6588', name='other', type='unannotated', positions=[134], cross_ref='CDD:206701'), Site(id='site6589', name='other', type='unannotated', positions=[140], cross_ref='CDD:206701'), Site(id='site6590', name='other', type='unannotated', positions=[142, 158, 164, 191, 245, 248, 280], cross_ref='CDD:206701'), Site(id='site6591', name='other', type='unannotated', positions=[148, 163], cross_ref='CDD:206701'), Site(id='site6592', name='other', type='unannotated', positions=[158, 163], cross_ref='CDD:206701'), Site(id='site6593', name='other', type='unannotated', positions=[163, 167, 181, 183], cross_ref='CDD:206701'), Site(id='site6594', name='other', type='unannotated', positions=[165], cross_ref='CDD:206701'), Site(id='site6595', name='active', type='active', positions=[166, 168, 185, 187, 194, 198, 202, 204], cross_ref='CDD:206701'), Site(id='site6596', name='other', type='unannotated', positions=[166, 169, 187, 195, 197, 199], cross_ref='CDD:206701'), Site(id='site6597', name='other', type='unannotated', positions=[166], cross_ref='CDD:206701'), Site(id='site6598', name='other', type='unannotated', positions=[183], cross_ref='CDD:206701'), Site(id='site6599', name='other', type='unannotated', positions=[188], cross_ref='CDD:206701'), Site(id='site6600', name='other', type='unannotated', positions=[191, 193], cross_ref='CDD:206701'), Site(id='site6601', name='other', type='unannotated', positions=[194], cross_ref='CDD:206701'), Site(id='site6602', name='other', type='unannotated', positions=[202], cross_ref='CDD:206701'), Site(id='site6603', name='other', type='unannotated', positions=[211], cross_ref='CDD:206701'), Site(id='site6604', name='other', type='unannotated', positions=[234], cross_ref='CDD:206701'), Site(id='site6605', name='other', type='unannotated', positions=[245], cross_ref='CDD:206701'), Site(id='site6606', name='other', type='unannotated', positions=[280], cross_ref='CDD:206701'), Site(id='site6607', name='other', type='unannotated', positions=[295], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='VIFY01000164.1', name=None, spans=[Span(id='span1369', start=8343, end=8844)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo23', source_id='KAG0673910.1', name='Ras GTPase tem1', sequence='MSTDKVPSLRHQVRVKVGLIGDAQVGKTSLMVKYVQNVFDEEYTQTLGVHYLERKVVLGSTDVIFSIMDLGGQREFINMLPLVSEGAVAIVFLFDLTRPETLNSIKEWYRQARGFNETAISILVGTKYDLFVEMDSKYQEEVSRTAMKYAQVMKSPLIFSSTQSSINVQKIFKVVIAKAFNITLKVPEYKQIGEPLLIYKSLGPTRPPSPNKRQVHTPPPSS', organism=Organism(id='organism771', name='Kluyveromyces marxianus', taxonomy_id='4911', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Saccharomycetes', order='Saccharomycetales', family='Saccharomycetaceae', genus='Kluyveromyces', species='Kluyveromyces marxianus'), citation=Citation(id='citation155', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion228', name='Spg1', spans=[Span(id='span810', start=14, end=196)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site4012', name='other', type='unannotated', positions=[14], cross_ref='CDD:206701'), Site(id='site4013', name='other', type='unannotated', positions=[20], cross_ref='CDD:206701'), Site(id='site4014', name='other', type='unannotated', positions=[22, 38, 44, 71, 125, 128, 160], cross_ref='CDD:206701'), Site(id='site4015', name='other', type='unannotated', positions=[28, 43], cross_ref='CDD:206701'), Site(id='site4016', name='other', type='unannotated', positions=[38, 43], cross_ref='CDD:206701'), Site(id='site4017', name='other', type='unannotated', positions=[43, 47, 61, 63], cross_ref='CDD:206701'), Site(id='site4018', name='other', type='unannotated', positions=[45], cross_ref='CDD:206701'), Site(id='site4019', name='active', type='active', positions=[46, 48, 65, 67, 74, 78, 82, 84], cross_ref='CDD:206701'), Site(id='site4020', name='other', type='unannotated', positions=[46, 49, 67, 75, 77, 79], cross_ref='CDD:206701'), Site(id='site4021', name='other', type='unannotated', positions=[46], cross_ref='CDD:206701'), Site(id='site4022', name='other', type='unannotated', positions=[63], cross_ref='CDD:206701'), Site(id='site4023', name='other', type='unannotated', positions=[68], cross_ref='CDD:206701'), Site(id='site4024', name='other', type='unannotated', positions=[71, 73], cross_ref='CDD:206701'), Site(id='site4025', name='other', type='unannotated', positions=[74], cross_ref='CDD:206701'), Site(id='site4026', name='other', type='unannotated', positions=[82], cross_ref='CDD:206701'), Site(id='site4027', name='other', type='unannotated', positions=[91], cross_ref='CDD:206701'), Site(id='site4028', name='other', type='unannotated', positions=[114], cross_ref='CDD:206701'), Site(id='site4029', name='other', type='unannotated', positions=[125], cross_ref='CDD:206701'), Site(id='site4030', name='other', type='unannotated', positions=[160], cross_ref='CDD:206701'), Site(id='site4031', name='other', type='unannotated', positions=[175], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='PUHT01000151.1', name=None, spans=[Span(id='span811', start=11608, end=12276)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo33', source_id='OEJ86227.1', name='Protein TEM1', sequence='MTEYENGQQQQRERIRGQERESYDEQERESYDEQERERIREQERRINNEQQRRVNNEQQRRXNNQQRSTTNDNVSSNKNTHKDLDLQDNLIPRTTFKLKVGLVGDAQVGKTSLMVKYVQSVFDDEYIQTLGVHHLEKTEFLKYADILFVINDLGGQREFINMLPIVSEDAVAIVYMFDLTRPETLTSIKEWYRQAKGLNNKAISLLVGTKYDLFIEMDSEYQEKISKIATLYSZAMNAPLIFSSTSESINVKVIFKIIVAKSFNLKLTIKEIXEIGDPLLLYKNLGNKHRISH', organism=Organism(id='organism691', name='Hanseniaspora osmophila', taxonomy_id='56408', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Saccharomycetes', order='Saccharomycetales', family='Saccharomycodaceae', genus='Hanseniaspora', species='Hanseniaspora osmophila'), citation=Citation(id='citation161', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion238', name='P-loop_NTPase', spans=[Span(id='span836', start=97, end=279)], note='P-loop containing Nucleoside Triphosphate Hydrolases; cl38936', cross_reference='CDD:453896', type=None)], sites=[Site(id='site4152', name='other', type='unannotated', positions=[103], cross_ref='CDD:206648'), Site(id='site4153', name='other', type='unannotated', positions=[105, 154, 208, 211, 243], cross_ref='CDD:206648'), Site(id='site4154', name='other', type='unannotated', positions=[128], cross_ref='CDD:206648'), Site(id='site4155', name='other', type='unannotated', positions=[132], cross_ref='CDD:206648'), Site(id='site4156', name='other', type='unannotated', positions=[151], cross_ref='CDD:206648'), Site(id='site4157', name='other', type='unannotated', positions=[153, 170], cross_ref='CDD:206648'), Site(id='site4158', name='other', type='unannotated', positions=[208], cross_ref='CDD:206648'), Site(id='site4159', name='other', type='unannotated', positions=[243], cross_ref='CDD:206648')], coding_sequence_ref=DNARegion(id='LPNM01000006.1', name=None, spans=[Span(id='span837', start=526872, end=527753)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo146', source_id='OWB55335.1', name='hypothetical protein', sequence='MEGTSQEQQHQHRSHRSHSQQQRSDSRNEIAHQNEESRSEQTNHTQETRQSQQTQQTQQTQQIQQTQQTQQTQQTQEVQQHHSGAQREKNTVTLKVGLIGDAQVGKTSLMVKYVENCFDEVYTQTLGVNYMERSINMKNTEITFTIWDLGGEAEFTNMLPLVAGDAIALLFMFDLSRKSTLNSVKDWYRQSRGFNRTAIPFLVGTKYDLFVDLSEAEQEKITRQAKNFAKAMNASLIFCSTSASINIQKIFKIVISKAFDLKLTIPEILNIGEPILLYQGI', organism=Organism(id='organism693', name='[Candida] boidinii', taxonomy_id='5477', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Saccharomycetes', order='Saccharomycetales', family='Pichiaceae', genus='Ogataea', species='[Candida] boidinii'), citation=Citation(id='citation167', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion350', name='Spg1', spans=[Span(id='span1290', start=93, end=275)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site6228', name='other', type='unannotated', positions=[93], cross_ref='CDD:206701'), Site(id='site6229', name='other', type='unannotated', positions=[99], cross_ref='CDD:206701'), Site(id='site6230', name='other', type='unannotated', positions=[101, 117, 123, 150, 204, 207, 239], cross_ref='CDD:206701'), Site(id='site6231', name='other', type='unannotated', positions=[107, 122], cross_ref='CDD:206701'), Site(id='site6232', name='other', type='unannotated', positions=[117, 122], cross_ref='CDD:206701'), Site(id='site6233', name='other', type='unannotated', positions=[122, 126, 140, 142], cross_ref='CDD:206701'), Site(id='site6234', name='other', type='unannotated', positions=[124], cross_ref='CDD:206701'), Site(id='site6235', name='active', type='active', positions=[125, 127, 144, 146, 153, 157, 161, 163], cross_ref='CDD:206701'), Site(id='site6236', name='other', type='unannotated', positions=[125, 128, 146, 154, 156, 158], cross_ref='CDD:206701'), Site(id='site6237', name='other', type='unannotated', positions=[125], cross_ref='CDD:206701'), Site(id='site6238', name='other', type='unannotated', positions=[142], cross_ref='CDD:206701'), Site(id='site6239', name='other', type='unannotated', positions=[147], cross_ref='CDD:206701'), Site(id='site6240', name='other', type='unannotated', positions=[150, 152], cross_ref='CDD:206701'), Site(id='site6241', name='other', type='unannotated', positions=[153], cross_ref='CDD:206701'), Site(id='site6242', name='other', type='unannotated', positions=[161], cross_ref='CDD:206701'), Site(id='site6243', name='other', type='unannotated', positions=[170], cross_ref='CDD:206701'), Site(id='site6244', name='other', type='unannotated', positions=[193], cross_ref='CDD:206701'), Site(id='site6245', name='other', type='unannotated', positions=[204], cross_ref='CDD:206701'), Site(id='site6246', name='other', type='unannotated', positions=[239], cross_ref='CDD:206701'), Site(id='site6247', name='other', type='unannotated', positions=[254], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='MSRZ01000034.1', name=None, spans=[Span(id='span1291', start=49580, end=50425)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo0', source_id='UCS38941.1', name='TEM1', sequence='MSASEMRAASERVGEERNSLPSVRNQVDIQVGLIGDAQVGKTSLMVKYVQNIFDEEYTQTLGVNFLKRKVSIRSTDIVFSLMDLGGQREFINMLPIATLGSSVIILLFDLTRPETLNSIKEWYRQALGLNDSAIPILVGTKYDLFIDLEEEYQEKVSKTSMKYAQVMDAPLIFCSTAKSINVQKIFKVALAKIFDLTLTIPEINEIGDPLLIYKELGSKKNKSKNSSKPRRRSPVDNENKELVSQPLNYGHTSE', organism=Organism(id='organism403', name='Nakaseomyces glabratus', taxonomy_id='5478', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Saccharomycetes', order='Saccharomycetales', family='Saccharomycetaceae', genus='Nakaseomyces', species='Nakaseomyces glabratus'), citation=Citation(id='citation169', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion205', name='Spg1', spans=[Span(id='span764', start=28, end=210)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site3684', name='other', type='unannotated', positions=[28], cross_ref='CDD:206701'), Site(id='site3685', name='other', type='unannotated', positions=[34], cross_ref='CDD:206701'), Site(id='site3686', name='other', type='unannotated', positions=[36, 52, 58, 85, 139, 142, 174], cross_ref='CDD:206701'), Site(id='site3687', name='other', type='unannotated', positions=[42, 57], cross_ref='CDD:206701'), Site(id='site3688', name='other', type='unannotated', positions=[52, 57], cross_ref='CDD:206701'), Site(id='site3689', name='other', type='unannotated', positions=[57, 61, 75, 77], cross_ref='CDD:206701'), Site(id='site3690', name='other', type='unannotated', positions=[59], cross_ref='CDD:206701'), Site(id='site3691', name='active', type='active', positions=[60, 62, 79, 81, 88, 92, 96, 98], cross_ref='CDD:206701'), Site(id='site3692', name='other', type='unannotated', positions=[60, 63, 81, 89, 91, 93], cross_ref='CDD:206701'), Site(id='site3693', name='other', type='unannotated', positions=[60], cross_ref='CDD:206701'), Site(id='site3694', name='other', type='unannotated', positions=[77], cross_ref='CDD:206701'), Site(id='site3695', name='other', type='unannotated', positions=[82], cross_ref='CDD:206701'), Site(id='site3696', name='other', type='unannotated', positions=[85, 87], cross_ref='CDD:206701'), Site(id='site3697', name='other', type='unannotated', positions=[88], cross_ref='CDD:206701'), Site(id='site3698', name='other', type='unannotated', positions=[96], cross_ref='CDD:206701'), Site(id='site3699', name='other', type='unannotated', positions=[105], cross_ref='CDD:206701'), Site(id='site3700', name='other', type='unannotated', positions=[128], cross_ref='CDD:206701'), Site(id='site3701', name='other', type='unannotated', positions=[139], cross_ref='CDD:206701'), Site(id='site3702', name='other', type='unannotated', positions=[174], cross_ref='CDD:206701'), Site(id='site3703', name='other', type='unannotated', positions=[189], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='CP060154.1', name=None, spans=[Span(id='span765', start=1590335, end=1591099)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo113', source_id='CUS14399.1', name='unnamed protein product', sequence='MATTPPPLFSPSPPPPLPDSLYPDDPEEPSPVRHHHSGPAPAAVGNFGSPPPNPPPPPQSLAYQQNGAGTTPTQAHYGGGGAVDMGKRNSVVIKVGMVGDAQIGKTSLMVKYVEGSFDEDYIQTLGVNFMEKTISIRNTEITFSIWDLGGQREFVNMLPLVCNDAVAILFMFDLTRKSTLNSIKEWYRQARGFNKTAIPFLVGTKYDHFVNFPREDQEEISKQARKFARAMKASLIFSSTSHSINVQKIFKIVLSKAFDLKCTIPEIEHVGEPLLLYQDVCIGLGSTVGGVGLYFFTPGFSSFLPFILHRFHLLQELESG', organism=Organism(id='organism678', name='Tuber aestivum', taxonomy_id='59557', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Pezizomycetes', order='Pezizales', family='Tuberaceae', genus='Tuber', species='Tuber aestivum'), citation=Citation(id='citation176', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion320', name='Spg1', spans=[Span(id='span1167', start=92, end=274)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site5648', name='other', type='unannotated', positions=[92], cross_ref='CDD:206701'), Site(id='site5649', name='other', type='unannotated', positions=[98], cross_ref='CDD:206701'), Site(id='site5650', name='other', type='unannotated', positions=[100, 116, 122, 149, 203, 206, 238], cross_ref='CDD:206701'), Site(id='site5651', name='other', type='unannotated', positions=[106, 121], cross_ref='CDD:206701'), Site(id='site5652', name='other', type='unannotated', positions=[116, 121], cross_ref='CDD:206701'), Site(id='site5653', name='other', type='unannotated', positions=[121, 125, 139, 141], cross_ref='CDD:206701'), Site(id='site5654', name='other', type='unannotated', positions=[123], cross_ref='CDD:206701'), Site(id='site5655', name='active', type='active', positions=[124, 126, 143, 145, 152, 156, 160, 162], cross_ref='CDD:206701'), Site(id='site5656', name='other', type='unannotated', positions=[124, 127, 145, 153, 155, 157], cross_ref='CDD:206701'), Site(id='site5657', name='other', type='unannotated', positions=[124], cross_ref='CDD:206701'), Site(id='site5658', name='other', type='unannotated', positions=[141], cross_ref='CDD:206701'), Site(id='site5659', name='other', type='unannotated', positions=[146], cross_ref='CDD:206701'), Site(id='site5660', name='other', type='unannotated', positions=[149, 151], cross_ref='CDD:206701'), Site(id='site5661', name='other', type='unannotated', positions=[152], cross_ref='CDD:206701'), Site(id='site5662', name='other', type='unannotated', positions=[160], cross_ref='CDD:206701'), Site(id='site5663', name='other', type='unannotated', positions=[169], cross_ref='CDD:206701'), Site(id='site5664', name='other', type='unannotated', positions=[192], cross_ref='CDD:206701'), Site(id='site5665', name='other', type='unannotated', positions=[203], cross_ref='CDD:206701'), Site(id='site5666', name='other', type='unannotated', positions=[238], cross_ref='CDD:206701'), Site(id='site5667', name='other', type='unannotated', positions=[253], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='LN890959.1', name=None, spans=[Span(id='span1174', start=376416, end=376499)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo108', source_id='VUG17726.1', name='DEBR0S2_14884g1_1', sequence='MNSQHRSVSSSQMDADQHGSDHNLGSARSASSQLDNRNGSSHERYTHLRRDISGILPVQRAASEMITHGQPQLTTSTSLQNMQTPHNRASSEQRATSTGPNRTGSRTSTRVAEGASNSSHHSEKNTVTLKVGLVGDAQVGKTSLMVKYVENCFDEIYTQTLGVNFMERTIRIRNTEITFSIWDLGGEAEFTNMLPLVASDAVAVLFMFDLSRKITLRSVKDWYRQARGFNRTAIPFLVGTKYDLFVDLPDEQQEEITRQAKKYARAMNAPLIFTSTCASINIQKIFKIVISKTFDLRLKIPEIVNTGEPILLYQNV', organism=Organism(id='organism755', name='Brettanomyces bruxellensis', taxonomy_id='5007', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Saccharomycetes', order='Saccharomycetales', family='Pichiaceae', genus='Brettanomyces', species='Brettanomyces bruxellensis'), citation=Citation(id='citation177', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion314', name='Spg1', spans=[Span(id='span1152', start=128, end=310)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site5548', name='other', type='unannotated', positions=[128], cross_ref='CDD:206701'), Site(id='site5549', name='other', type='unannotated', positions=[134], cross_ref='CDD:206701'), Site(id='site5550', name='other', type='unannotated', positions=[136, 152, 158, 185, 239, 242, 274], cross_ref='CDD:206701'), Site(id='site5551', name='other', type='unannotated', positions=[142, 157], cross_ref='CDD:206701'), Site(id='site5552', name='other', type='unannotated', positions=[152, 157], cross_ref='CDD:206701'), Site(id='site5553', name='other', type='unannotated', positions=[157, 161, 175, 177], cross_ref='CDD:206701'), Site(id='site5554', name='other', type='unannotated', positions=[159], cross_ref='CDD:206701'), Site(id='site5555', name='active', type='active', positions=[160, 162, 179, 181, 188, 192, 196, 198], cross_ref='CDD:206701'), Site(id='site5556', name='other', type='unannotated', positions=[160, 163, 181, 189, 191, 193], cross_ref='CDD:206701'), Site(id='site5557', name='other', type='unannotated', positions=[160], cross_ref='CDD:206701'), Site(id='site5558', name='other', type='unannotated', positions=[177], cross_ref='CDD:206701'), Site(id='site5559', name='other', type='unannotated', positions=[182], cross_ref='CDD:206701'), Site(id='site5560', name='other', type='unannotated', positions=[185, 187], cross_ref='CDD:206701'), Site(id='site5561', name='other', type='unannotated', positions=[188], cross_ref='CDD:206701'), Site(id='site5562', name='other', type='unannotated', positions=[196], cross_ref='CDD:206701'), Site(id='site5563', name='other', type='unannotated', positions=[205], cross_ref='CDD:206701'), Site(id='site5564', name='other', type='unannotated', positions=[228], cross_ref='CDD:206701'), Site(id='site5565', name='other', type='unannotated', positions=[239], cross_ref='CDD:206701'), Site(id='site5566', name='other', type='unannotated', positions=[274], cross_ref='CDD:206701'), Site(id='site5567', name='other', type='unannotated', positions=[289], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='CABFWN010000002.1', name=None, spans=[Span(id='span1153', start=1607711, end=1608661)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo160', source_id='KAI9730992.1', name='hypothetical protein', sequence='MEEPMSSPLPEGQYTPRETEPMEQPLSMPEDSTLSTPTYSNGYSNGTSRHQSTPSNDFSSPQHTLSPGGDTDQNSRYSSPTTYQPPPPQQQQQQQQPQQQQQQQASRPGSGLSGAGGYQHSSTYHDQNQRPPAEPSRSRNSVVIKVGMVGDAQIGKTSLMVKYVEGSWDEDYIQTLGVNFMEKTISIRNTEITFSIWDLGGQREFVNMLPLVCNDAVAILFMFDLTRKSTLNSIKEWYRQGRGFNKTAIPFLIGTKYDHFVNFPREDQEEISTQARRFAKAMKASLIFSSTSHSINVQKIFKIVLSKAFDLKCTIPEISEVGEPLLLYQSVG', organism=Organism(id='organism714', name='Cirrosporium novae-zelandiae', taxonomy_id='1130053', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Xylobotryomycetes', order='Xylobotryales', family='Cirrosporiaceae', genus='Cirrosporium', species='Cirrosporium novae-zelandiae'), citation=Citation(id='citation187', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion365', name='Spg1', spans=[Span(id='span1344', start=143, end=325)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site6508', name='other', type='unannotated', positions=[143], cross_ref='CDD:206701'), Site(id='site6509', name='other', type='unannotated', positions=[149], cross_ref='CDD:206701'), Site(id='site6510', name='other', type='unannotated', positions=[151, 167, 173, 200, 254, 257, 289], cross_ref='CDD:206701'), Site(id='site6511', name='other', type='unannotated', positions=[157, 172], cross_ref='CDD:206701'), Site(id='site6512', name='other', type='unannotated', positions=[167, 172], cross_ref='CDD:206701'), Site(id='site6513', name='other', type='unannotated', positions=[172, 176, 190, 192], cross_ref='CDD:206701'), Site(id='site6514', name='other', type='unannotated', positions=[174], cross_ref='CDD:206701'), Site(id='site6515', name='active', type='active', positions=[175, 177, 194, 196, 203, 207, 211, 213], cross_ref='CDD:206701'), Site(id='site6516', name='other', type='unannotated', positions=[175, 178, 196, 204, 206, 208], cross_ref='CDD:206701'), Site(id='site6517', name='other', type='unannotated', positions=[175], cross_ref='CDD:206701'), Site(id='site6518', name='other', type='unannotated', positions=[192], cross_ref='CDD:206701'), Site(id='site6519', name='other', type='unannotated', positions=[197], cross_ref='CDD:206701'), Site(id='site6520', name='other', type='unannotated', positions=[200, 202], cross_ref='CDD:206701'), Site(id='site6521', name='other', type='unannotated', positions=[203], cross_ref='CDD:206701'), Site(id='site6522', name='other', type='unannotated', positions=[211], cross_ref='CDD:206701'), Site(id='site6523', name='other', type='unannotated', positions=[220], cross_ref='CDD:206701'), Site(id='site6524', name='other', type='unannotated', positions=[243], cross_ref='CDD:206701'), Site(id='site6525', name='other', type='unannotated', positions=[254], cross_ref='CDD:206701'), Site(id='site6526', name='other', type='unannotated', positions=[289], cross_ref='CDD:206701'), Site(id='site6527', name='other', type='unannotated', positions=[304], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='JAPETS010000032.1', name=None, spans=[Span(id='span1348', start=85104, end=85632)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[]),\n", + " ProteinInfo(id='proteininfo91', source_id='RPA74746.1', name='small GTPase', sequence='MADHHEGSSHSYDRTSHSSRPATAEKQPAPKEETKNSVVIKVGMVGDAQIGKTSLMVKYVEGSFDEDYIQTLGVNFMEKTISIRNTEITFSIWDLGGQREFVNMLPLVCNDAVAILFMFDLTRKSTLNSIKEWYRQARGFNKTAIPFLIGTKYDHYVNFPIEDQEEISKQAKRFAKAMKASLIFSSTSHSINIQKIFKIVLSKAFDLKCTIPEITEIGEPLLLYQDV', organism=Organism(id='organism611', name='Ascobolus immersus RN42', taxonomy_id='1160509', domain='Eukaryota', kingdom='Fungi', phylum='Ascomycota', tax_class='Pezizomycetes', order='Pezizales', family='Ascobolaceae', genus='Ascobolus', species='Ascobolus immersus'), citation=Citation(id='citation200', doi=None, pubmed_id=None, medline_id=None, year=None, authors=[]), family_name=None, regions=[ProteinRegion(id='proteinregion296', name='Spg1', spans=[Span(id='span1088', start=39, end=221)], note='Septum-promoting GTPase (Spg1); cd04128', cross_reference='CDD:206701', type=None)], sites=[Site(id='site5248', name='other', type='unannotated', positions=[39], cross_ref='CDD:206701'), Site(id='site5249', name='other', type='unannotated', positions=[45], cross_ref='CDD:206701'), Site(id='site5250', name='other', type='unannotated', positions=[47, 63, 69, 96, 150, 153, 185], cross_ref='CDD:206701'), Site(id='site5251', name='other', type='unannotated', positions=[53, 68], cross_ref='CDD:206701'), Site(id='site5252', name='other', type='unannotated', positions=[63, 68], cross_ref='CDD:206701'), Site(id='site5253', name='other', type='unannotated', positions=[68, 72, 86, 88], cross_ref='CDD:206701'), Site(id='site5254', name='other', type='unannotated', positions=[70], cross_ref='CDD:206701'), Site(id='site5255', name='active', type='active', positions=[71, 73, 90, 92, 99, 103, 107, 109], cross_ref='CDD:206701'), Site(id='site5256', name='other', type='unannotated', positions=[71, 74, 92, 100, 102, 104], cross_ref='CDD:206701'), Site(id='site5257', name='other', type='unannotated', positions=[71], cross_ref='CDD:206701'), Site(id='site5258', name='other', type='unannotated', positions=[88], cross_ref='CDD:206701'), Site(id='site5259', name='other', type='unannotated', positions=[93], cross_ref='CDD:206701'), Site(id='site5260', name='other', type='unannotated', positions=[96, 98], cross_ref='CDD:206701'), Site(id='site5261', name='other', type='unannotated', positions=[99], cross_ref='CDD:206701'), Site(id='site5262', name='other', type='unannotated', positions=[107], cross_ref='CDD:206701'), Site(id='site5263', name='other', type='unannotated', positions=[116], cross_ref='CDD:206701'), Site(id='site5264', name='other', type='unannotated', positions=[139], cross_ref='CDD:206701'), Site(id='site5265', name='other', type='unannotated', positions=[150], cross_ref='CDD:206701'), Site(id='site5266', name='other', type='unannotated', positions=[185], cross_ref='CDD:206701'), Site(id='site5267', name='other', type='unannotated', positions=[200], cross_ref='CDD:206701')], coding_sequence_ref=DNARegion(id='ML119781.1', name=None, spans=[Span(id='span1093', start=46954, end=47052)], note=None, cross_reference=None, type='coding sequence'), ec_number=None, mol_weight=None, substrates=[])]" ] }, - "execution_count": 37, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "MMSeqs2(coverage=0.8).cluster(sequences)" + "MMSeqs2.easy_cluster(sequences, identity=0.77)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⬇️ Fetching 8 protein entries for NCBI...\n", + "⬇️ Fetching 8 taxonomy entries for NCBI...\n", + "🏃 Clustering sequences with MMSeqs2...\n", + "╭── initial sequences: 8\n", + "├── min. coverage: 90 %\n", + "╰── min. sequence identity: 60 %\n", + "🎉 Clustered intitial sequences in 1 representative sequences\n" + ] + } + ], + "source": [ + "from pyeed.core import ProteinInfo\n", + "from pyeed.containers import MMSeqs2\n", + "\n", + "# Accessions from different methionine adenyltransferases\n", + "mat_accessions = [\n", + " \"MBP1912539.1\",\n", + " \"SEV92896.1\",\n", + " \"MBO8174569.1\",\n", + " \"WP_042680787.1\",\n", + " \"NPA47376.1\",\n", + " \"WP_167889085.1\",\n", + " \"WP_048165429.1\",\n", + " \"ACS90033.1\",\n", + "]\n", + "mats = ProteinInfo.get_ids(mat_accessions)\n", + "\n", + "# Cluster the sequences\n", + "clusters = MMSeqs2.easy_cluster(\n", + " sequences=mats,\n", + " coverage=0.9,\n", + " identity=0.6,\n", + ")" ] } ], diff --git a/pyeed/containers/__init__.py b/pyeed/containers/__init__.py index 8dba68dc..2ee6bae5 100644 --- a/pyeed/containers/__init__.py +++ b/pyeed/containers/__init__.py @@ -1 +1,2 @@ from .abstract_container import AbstractContainer, ToolImage +from .mmseqs2 import MMSeqs2 diff --git a/pyeed/containers/mmseqs2.py b/pyeed/containers/mmseqs2.py index 9b596d30..971b4f6b 100644 --- a/pyeed/containers/mmseqs2.py +++ b/pyeed/containers/mmseqs2.py @@ -23,7 +23,7 @@ class MMSeqs2(AbstractContainer): protein sequence sets. ## Attributes: - min_seq_id (float): Minimum sequence identity threshold for clustering. Value + identity (float): Minimum sequence identity threshold for clustering. Value should be between 0 and 1. Default is 0. coverage (float): Minimum fraction of aligned (covered) residues to consider a match. Value should be between 0 and 1. Default is 0.8. @@ -53,14 +53,14 @@ class MMSeqs2(AbstractContainer): description="Search mode", default="easy-cluster", ) - min_seq_id: float = Field( - description="List matches above this sequence identity (for clustering)", + identity: float = Field( + description="Minimum sequence identity threshold for clustering", default=0, ge=0, le=1, ) coverage: float = Field( - description="List matches above this fraction of aligned (covered) residues", + description="Minimum fraction of aligned (covered) residues to consider a match", default=0.8, gt=0, le=1, @@ -137,7 +137,7 @@ def run_container(self, command: str, data: str) -> Container: print("🏃 Clustering sequences with MMSeqs2...") print(f"╭── initial sequences: {data.count('>')}") print(f"├── min. coverage: {int(self.coverage*100)} %") - print(f"╰── min. sequence identity: {int(self.min_seq_id*100)} %") + print(f"╰── min. sequence identity: {int(self.identity*100)} %") logger.debug(f"🏃 Running {self._container_info.name}") self._client.containers.run( @@ -153,25 +153,23 @@ def run_container(self, command: str, data: str) -> Container: # output = container.attach(stdout=True, stream=True, logs=True) # Set detatch True to access stream - print("🎉 Clustering completed") - return self.extract_output_data() except Exception as e: logger.error(f"Error running {self._container_info} container: {e}") - def setup_command(self): + def setup_command(self, mode: str): """Sets up the command to run the MMSeqs2 container.""" command = ( "mmseqs " - f"{self.mode} " + f"{mode} " f"/app/input.fasta " f"/app/result " f"temp " f"-c {self.coverage} " f"--cov-mode {self.cov_mode} " - f"--min-seq-id {self.min_seq_id} " + f"--min-seq-id {self.identity} " f"--alignment-mode {self.alignment_mode} " f"--alignment-output-mode {self.alignment_output_mode} " f"--cluster-mode {self.cluster_mode} " @@ -179,22 +177,35 @@ def setup_command(self): return command - def cluster(self, sequences: List[AbstractSequence]) -> List[AbstractSequence]: - """Clusters a list of sequences using MMSeqs2. - - Args: - sequences (List[AbstractSequence]): `ProteinInfo` or `DNAInfo` objects to be clustered. + @classmethod + def easy_cluster( + cls, + sequences: List[AbstractSequence], + identity: float = 0, + coverage: float = 0.8, + cov_mode: int = 0, + alignment_mode: int = 3, + alignment_output_mode: int = 0, + cluster_mode: int = 0, + ) -> List[AbstractSequence]: + + mode = "easy-cluster" + instance = cls( + identity=identity, + coverage=coverage, + cov_mode=cov_mode, + alignment_mode=alignment_mode, + alignment_output_mode=alignment_output_mode, + cluster_mode=cluster_mode, + ) - Returns: - List[AbstractSequence]: Representatives of the clusters. - """ + command = instance.setup_command(mode) multifasta = "\n".join( [f">{seq.source_id}\n{seq.sequence}" for seq in sequences] ) - result = self.run_container(self.setup_command(), multifasta) - print() + result = instance.run_container(command, multifasta) ids = re.findall(r">([A-Z0-9\.]+) ", result["representatives"]) @@ -208,5 +219,11 @@ def extract_output_data(self): with open(f"{self._tempdir_path}/result_cluster.tsv") as f: results["cluster"] = f.read() + n_clusters = results["representatives"].count(">") + self._delete_temp_dir() + print( + f"🎉 Clustered intitial sequences in {n_clusters} representative sequences" + ) + return results