Skip to content

Commit

Permalink
Fix incomplete read (#47)
Browse files Browse the repository at this point in the history
* added logging, bumped version

* fixed logging in container

* implemented new parser for ncbi

* added abstract parser ABC

* added chunk loading

* added examples
  • Loading branch information
haeussma authored Mar 7, 2024
1 parent f16bacb commit 49345df
Show file tree
Hide file tree
Showing 10 changed files with 3,770 additions and 3,256 deletions.
72 changes: 69 additions & 3 deletions examples/alignments/test_pairwise.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -28,14 +28,42 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"ald1 = ProteinInfo.from_ncbi(\"NP_001287541.1\")\n",
"ald2 = ProteinInfo.from_ncbi(\"AAN14384.1\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/max/miniconda3/envs/pye/lib/python3.10/site-packages/sdRDM/base/datamodel.py:322: UserWarning: No 'URL' and 'Commit' specified. This model might not be re-usable.\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"'{\\n \"id\": \"proteininfo0\",\\n \"source_id\": \"NP_001287541.1\",\\n \"name\": \"aldolase 1, isoform M\",\\n \"sequence\": \"MTTYFNYPSKELQDELREIAQKIVAPGKGILAADESGPTMGKRLQDIGVENTEDNRRAYRQLLFSTDPKLAENISGVILFHETLYQKADDGTPFAEILKKKGIILGIKVDKGVVPLFGSEDEVTTQGLDDLAARCAQYKKDGCDFAKWRCVLKIGKNTPSYQSILENANVLARYASICQSQRIVPIVEPEVLPDGDHDLDRAQKVTETVLAAVYKALSDHHVYLEGTLLKPNMVTAGQSAKKNTPEEIALATVQALRRTVPAAVTGVTFLSGGQSEEEATVNLSAINNVPLIRPWALTFSYGRALQASVLRAWAGKKENIAAGQNELLKRAKANGDAAQGKYVAGSAGAGSGSLFVANHAY\",\\n \"organism\": {\\n \"id\": \"organism0\",\\n \"name\": \"Drosophila melanogaster\",\\n \"taxonomy_id\": \"taxon:7227\",\\n \"domain\": \"Eukaryota\",\\n \"kingdom\": \"Metazoa\",\\n \"phylum\": \"Arthropoda\",\\n \"tax_class\": \"Insecta\",\\n \"order\": \"Diptera\",\\n \"family\": \"Drosophilidae\",\\n \"genus\": \"Drosophila\",\\n \"species\": \"melanogaster\"\\n },\\n \"regions\": [\\n {\\n \"id\": \"proteinregion0\",\\n \"name\": \"Glycolytic\",\\n \"spans\": [\\n {\\n \"id\": \"span0\",\\n \"start\": 14,\\n \"end\": 361\\n }\\n ],\\n \"note\": \"Fructose-bisphosphate aldolase class-I; pfam00274\",\\n \"cross_reference\": \"CDD:425574\"\\n }\\n ],\\n \"sites\": [\\n {\\n \"id\": \"site0\",\\n \"name\": \"active\",\\n \"type\": \"active\",\\n \"positions\": [\\n 33,\\n 34,\\n 35,\\n 38,\\n 107,\\n 146,\\n 148,\\n 187,\\n 229,\\n 269,\\n 270,\\n 271,\\n 299,\\n 301,\\n 302\\n ],\\n \"cross_ref\": \"CDD:188635\"\\n },\\n {\\n \"id\": \"site1\",\\n \"name\": \"intersubunit interface [polypeptide binding]\",\\n \"type\": \"unannotated\",\\n \"positions\": [\\n 110,\\n 161,\\n 164,\\n 165,\\n 168,\\n 172,\\n 175,\\n 203,\\n 207,\\n 210,\\n 217,\\n 218,\\n 220,\\n 224,\\n 256,\\n 257,\\n 259\\n ],\\n \"cross_ref\": \"CDD:188635\"\\n },\\n {\\n \"id\": \"site2\",\\n \"name\": \"catalytic residue [active]\",\\n \"type\": \"active\",\\n \"positions\": [\\n 229\\n ],\\n \"cross_ref\": \"CDD:188635\"\\n }\\n ],\\n \"coding_sequence_ref\": {\\n \"id\": \"NM_001300612.1\",\\n \"spans\": 
[\\n {\\n \"id\": \"span1\",\\n \"start\": 278,\\n \"end\": 1363\\n }\\n ],\\n \"type\": \"coding sequence\"\\n },\\n \"ec_number\": \"4.1.2.13\",\\n \"mol_weight\": 38916.0\\n}'"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ald1.json()"
]
},
{
"cell_type": "code",
"execution_count": 8,
Expand Down Expand Up @@ -66,6 +94,44 @@
"source": [
"## BLASTP"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'TaxId': '9606', 'ScientificName': 'Homo sapiens', 'OtherNames': {'Misnomer': [], 'CommonName': [], 'Synonym': [], 'Misspelling': [], 'EquivalentName': [], 'Anamorph': [], 'GenbankAnamorph': [], 'Name': [{'ClassCDE': 'authority', 'DispName': 'Homo sapiens Linnaeus, 1758'}, {'ClassCDE': 'misspelling', 'DispName': 'Home sapiens'}, {'ClassCDE': 'misspelling', 'DispName': 'Homo sampiens'}, {'ClassCDE': 'misspelling', 'DispName': 'Homo sapeins'}, {'ClassCDE': 'misspelling', 'DispName': 'Homo sapian'}, {'ClassCDE': 'misspelling', 'DispName': 'Homo sapians'}, {'ClassCDE': 'misspelling', 'DispName': 'Homo sapien'}, {'ClassCDE': 'misspelling', 'DispName': 'Homo sapience'}, {'ClassCDE': 'misspelling', 'DispName': 'Homo sapiense'}, {'ClassCDE': 'misspelling', 'DispName': 'Homo sapients'}, {'ClassCDE': 'misspelling', 'DispName': 'Homo sapines'}, {'ClassCDE': 'misspelling', 'DispName': 'Homo spaiens'}, {'ClassCDE': 'misspelling', 'DispName': 'Homo spiens'}, {'ClassCDE': 'misspelling', 'DispName': 'Humo sapiens'}], 'Teleomorph': [], 'GenbankSynonym': [], 'Inpart': [], 'Acronym': [], 'Includes': [], 'GenbankCommonName': 'human'}, 'ParentTaxId': '9605', 'Rank': 'species', 'Division': 'Primates', 'GeneticCode': {'GCId': '1', 'GCName': 'Standard'}, 'MitoGeneticCode': {'MGCId': '2', 'MGCName': 'Vertebrate Mitochondrial'}, 'Lineage': 'cellular organisms; Eukaryota; Opisthokonta; Metazoa; Eumetazoa; Bilateria; Deuterostomia; Chordata; Craniata; Vertebrata; Gnathostomata; Teleostomi; Euteleostomi; Sarcopterygii; Dipnotetrapodomorpha; Tetrapoda; Amniota; Mammalia; Theria; Eutheria; Boreoeutheria; Euarchontoglires; Primates; Haplorrhini; Simiiformes; Catarrhini; Hominoidea; Hominidae; Homininae; Homo', 'LineageEx': [{'TaxId': '131567', 'ScientificName': 'cellular organisms', 'Rank': 'no rank'}, {'TaxId': '2759', 'ScientificName': 'Eukaryota', 'Rank': 'superkingdom'}, {'TaxId': '33154', 'ScientificName': 'Opisthokonta', 'Rank': 'clade'}, {'TaxId': '33208', 'ScientificName': 'Metazoa', 
'Rank': 'kingdom'}, {'TaxId': '6072', 'ScientificName': 'Eumetazoa', 'Rank': 'clade'}, {'TaxId': '33213', 'ScientificName': 'Bilateria', 'Rank': 'clade'}, {'TaxId': '33511', 'ScientificName': 'Deuterostomia', 'Rank': 'clade'}, {'TaxId': '7711', 'ScientificName': 'Chordata', 'Rank': 'phylum'}, {'TaxId': '89593', 'ScientificName': 'Craniata', 'Rank': 'subphylum'}, {'TaxId': '7742', 'ScientificName': 'Vertebrata', 'Rank': 'clade'}, {'TaxId': '7776', 'ScientificName': 'Gnathostomata', 'Rank': 'clade'}, {'TaxId': '117570', 'ScientificName': 'Teleostomi', 'Rank': 'clade'}, {'TaxId': '117571', 'ScientificName': 'Euteleostomi', 'Rank': 'clade'}, {'TaxId': '8287', 'ScientificName': 'Sarcopterygii', 'Rank': 'superclass'}, {'TaxId': '1338369', 'ScientificName': 'Dipnotetrapodomorpha', 'Rank': 'clade'}, {'TaxId': '32523', 'ScientificName': 'Tetrapoda', 'Rank': 'clade'}, {'TaxId': '32524', 'ScientificName': 'Amniota', 'Rank': 'clade'}, {'TaxId': '40674', 'ScientificName': 'Mammalia', 'Rank': 'class'}, {'TaxId': '32525', 'ScientificName': 'Theria', 'Rank': 'clade'}, {'TaxId': '9347', 'ScientificName': 'Eutheria', 'Rank': 'clade'}, {'TaxId': '1437010', 'ScientificName': 'Boreoeutheria', 'Rank': 'clade'}, {'TaxId': '314146', 'ScientificName': 'Euarchontoglires', 'Rank': 'superorder'}, {'TaxId': '9443', 'ScientificName': 'Primates', 'Rank': 'order'}, {'TaxId': '376913', 'ScientificName': 'Haplorrhini', 'Rank': 'suborder'}, {'TaxId': '314293', 'ScientificName': 'Simiiformes', 'Rank': 'infraorder'}, {'TaxId': '9526', 'ScientificName': 'Catarrhini', 'Rank': 'parvorder'}, {'TaxId': '314295', 'ScientificName': 'Hominoidea', 'Rank': 'superfamily'}, {'TaxId': '9604', 'ScientificName': 'Hominidae', 'Rank': 'family'}, {'TaxId': '207598', 'ScientificName': 'Homininae', 'Rank': 'subfamily'}, {'TaxId': '9605', 'ScientificName': 'Homo', 'Rank': 'genus'}], 'CreateDate': '1995/02/27 09:24:00', 'UpdateDate': '2021/09/24 13:17:40', 'PubDate': '1992/05/26 01:00:00'}\n"
]
}
],
"source": [
"from Bio import Entrez\n",
"from numpy import rec\n",
"\n",
"# Always provide your email address to NCBI when using the Entrez API\n",
"Entrez.email = \"your.email@example.com\"\n",
"\n",
"\n",
"def get_taxonomy(tax_id):\n",
"    \"\"\"Fetch a single taxonomy record from NCBI via the Entrez API.\n",
"\n",
"    Args:\n",
"        tax_id: NCBI taxonomy identifier; converted with ``str()`` before the request.\n",
"\n",
"    Returns:\n",
"        The first record parsed from the ``efetch`` XML response.\n",
"\n",
"    Raises:\n",
"        IndexError: If the parsed response contains no records.\n",
"    \"\"\"\n",
"    handle = Entrez.efetch(db=\"taxonomy\", id=str(tax_id), retmode=\"xml\")\n",
"    try:\n",
"        records = Entrez.read(handle)\n",
"    finally:\n",
"        # Close the handle even when parsing fails so the connection is not leaked\n",
"        handle.close()\n",
"\n",
"    # Same exception type as a bare records[0] on an empty result, but with a usable message\n",
"    if not records:\n",
"        raise IndexError(f\"No taxonomy record found for tax_id {tax_id!r}\")\n",
"\n",
"    # Return the first parsed taxonomy record\n",
"    return records[0]\n",
"\n",
"\n",
"# Example usage\n",
"tax_id = \"9606\" # Taxonomy ID for Homo sapiens\n",
"taxonomy_info = get_taxonomy(tax_id)\n",
"print(taxonomy_info)"
]
}
],
"metadata": {
Expand All @@ -84,7 +150,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
"version": "3.10.13"
}
},
"nbformat": 4,
Expand Down
Loading

0 comments on commit 49345df

Please sign in to comment.