Skip to content

Commit

Permalink
All requests to NCBI API can receive an API key (#54)
Browse files Browse the repository at this point in the history
  • Loading branch information
tcezard authored Apr 12, 2024
1 parent 03892a2 commit cbd60a9
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 13 deletions.
14 changes: 8 additions & 6 deletions ebi_eva_common_pyutils/ncbi_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ def get_ncbi_taxonomy_dicts_from_ids(taxonomy_ids, api_key=None):
return taxonomy_dicts


def get_ncbi_assembly_name_from_term(term):
assembl_dicts = get_ncbi_assembly_dicts_from_term(term)
def get_ncbi_assembly_name_from_term(term, api_key=None):
assembl_dicts = get_ncbi_assembly_dicts_from_term(term, api_key=api_key)
assembly_names = set([d.get('assemblyname') for d in assembl_dicts])
if len(assembly_names) > 1:
# Only keep the ones that have the assembly accession as a synonym and check again
Expand All @@ -82,8 +82,10 @@ def get_ncbi_assembly_name_from_term(term):
return assembly_names.pop() if assembly_names else None


def retrieve_species_scientific_name_from_tax_id_ncbi(taxid):
def retrieve_species_scientific_name_from_tax_id_ncbi(taxid, api_key=None):
payload = {'db': 'Taxonomy', 'id': taxid}
if api_key:
payload['api_key'] = api_key
r = requests.get(efetch_url, params=payload)
match = re.search('<Rank>(.+?)</Rank>', r.text, re.MULTILINE)
rank = None
Expand All @@ -96,9 +98,9 @@ def retrieve_species_scientific_name_from_tax_id_ncbi(taxid):
return match.group(1)


def get_species_name_from_ncbi(assembly_acc):
def get_species_name_from_ncbi(assembly_acc, api_key=None):
# We first need to search for the species associated with the assembly
assembly_dicts = get_ncbi_assembly_dicts_from_term(assembly_acc)
assembly_dicts = get_ncbi_assembly_dicts_from_term(assembly_acc, api_key=api_key)
taxids = set([assembly_dict.get('taxid')
for assembly_dict in assembly_dicts
if assembly_dict.get('assemblyaccession') == assembly_acc or
Expand All @@ -111,5 +113,5 @@ def get_species_name_from_ncbi(assembly_acc):

taxonomy_id = taxids.pop()

scientific_name = retrieve_species_scientific_name_from_tax_id_ncbi(taxonomy_id)
scientific_name = retrieve_species_scientific_name_from_tax_id_ncbi(taxonomy_id, api_key=api_key)
return scientific_name.replace(' ', '_').lower()
13 changes: 6 additions & 7 deletions ebi_eva_internal_pyutils/metadata_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,10 +117,10 @@ def resolve_existing_variant_warehouse_db_name(metadata_connection_handle, assem
get_variant_warehouse_db_name_from_assembly_and_taxonomy = resolve_existing_variant_warehouse_db_name


def get_assembly_code(metadata_connection_handle, assembly):
def get_assembly_code(metadata_connection_handle, assembly, ncbi_api_key=None):
assembly_code = get_assembly_code_from_metadata(metadata_connection_handle, assembly)
if not assembly_code:
assembly_name = get_ncbi_assembly_name_from_term(assembly)
assembly_name = get_ncbi_assembly_name_from_term(assembly, api_key=ncbi_api_key)
# If the assembly is a patch assembly (e.g. GRCh37.p8), drop the trailing patch suffix, i.e. just return grch37
if is_patch_assembly(assembly):
assembly_name = re.sub('\\.p[0-9]+$', '', assembly_name.lower())
Expand All @@ -141,18 +141,18 @@ def get_taxonomy_code(metadata_connection_handle, taxonomy):
return taxonomy_code


def resolve_variant_warehouse_db_name(metadata_connection_handle, assembly, taxonomy):
def resolve_variant_warehouse_db_name(metadata_connection_handle, assembly, taxonomy, ncbi_api_key=None):
"""
Retrieve the database name for this taxonomy/assembly pair whether it exists or not.
It will use existing taxonomy code or assembly code if available in the metadata database.
"""
taxonomy_code = get_taxonomy_code(metadata_connection_handle, taxonomy)
assembly_code = get_assembly_code(metadata_connection_handle, assembly)
assembly_code = get_assembly_code(metadata_connection_handle, assembly, ncbi_api_key=ncbi_api_key)
return build_variant_warehouse_database_name(taxonomy_code, assembly_code)


def insert_new_assembly_and_taxonomy(metadata_connection_handle, assembly_accession, taxonomy_id, eva_species_name=None,
in_accessioning=True):
in_accessioning=True, ncbi_api_key=None):
"""
This function adds new assemblies and taxonomies to EVAPRO.
You can also add the assembly with a different taxonomy if you provide the
Expand All @@ -166,11 +166,10 @@ def insert_new_assembly_and_taxonomy(metadata_connection_handle, assembly_access
Not required if the taxonomy exists or ENA has a common name available.
:param in_accessioning: Flag that this assembly is in the accessioning data store.
:param ncbi_api_key: Optional NCBI API key, passed on when looking up the assembly name from NCBI.
"""
assembly_name = get_ncbi_assembly_name_from_term(assembly_accession)

# check if assembly is already in EVAPRO, adding it if not
assembly_set_id = get_assembly_set_from_metadata(metadata_connection_handle, taxonomy_id, assembly_accession)
if assembly_set_id is None:
assembly_name = get_ncbi_assembly_name_from_term(assembly_accession, api_key=ncbi_api_key)
ensure_taxonomy_is_in_evapro(metadata_connection_handle, taxonomy_id, eva_species_name)
assembly_code = get_assembly_code(metadata_connection_handle, assembly_accession)
insert_assembly_in_evapro(metadata_connection_handle, taxonomy_id, assembly_accession, assembly_name, assembly_code)
Expand Down

0 comments on commit cbd60a9

Please sign in to comment.