Skip to content

Commit

Permalink
Merge pull request #263 from RobokopU24/hgnc_fix
Browse files Browse the repository at this point in the history
updating HGNC file location and version date for new HGNC set up
  • Loading branch information
EvanDietzMorris authored Oct 24, 2024
2 parents b5fcc32 + f604791 commit 6b8f389
Showing 1 changed file with 20 additions and 28 deletions.
48 changes: 20 additions & 28 deletions parsers/hgnc/src/loadHGNC.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import argparse
import csv
import os
import requests

from Common.utils import GetData
from Common.loader_interface import SourceDataLoader
Expand All @@ -10,7 +11,7 @@


##############
# Class: HGNC metabolites loader
# Class: HGNC loader
#
# By: Phil Owen
# Date: 3/31/2021
Expand All @@ -21,56 +22,47 @@ class HGNCLoader(SourceDataLoader):
source_id: str = HGNC
provenance_id: str = 'infores:hgnc'
description = "The HUGO Gene Nomenclature Committee (HGNC) database provides open access to HGNC-approved unique symbols and names for human genes, gene groups, and associated resources, including links to genomic, proteomic and phenotypic information."
source_data_url = "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/"
source_data_url = "https://www.genenames.org/download/archive/"
license = "https://www.genenames.org/about/"
attribution = "https://www.genenames.org/about/"
parsing_version: str = '1.2'
parsing_version: str = '1.3'

def __init__(self, test_mode: bool = False, source_data_dir: str = None):
"""
:param test_mode - sets the run into test mode
:param source_data_dir - the specific storage directory to save files in
"""
super().__init__(test_mode=test_mode, source_data_dir=source_data_dir)

self.source_db = 'HUGO Gene Nomenclature Committee'
self.complete_set_file_name = 'hgnc_complete_set.txt'
# self.gene_groups_file_name = 'hgnc_genes_in_groups.txt'
self.data_files: list = [self.complete_set_file_name,
# self.gene_groups_file_name
]
self.test_mode: bool = test_mode
self.source_db: str = 'HUGO Gene Nomenclature Committee'

self.ftp_site = 'ftp.ebi.ac.uk'
self.ftp_dir = '/pub/databases/genenames/hgnc/tsv/'
self.data_file = self.complete_set_file_name
self.data_url = "https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/"

def get_latest_source_version(self) -> str:
"""
gets the version of the data
:return: the data version
"""
data_puller = GetData()
# HGNC files change very frequently, excluding the day makes sure we only update it once per month
data_file_date = data_puller.get_ftp_file_date(self.ftp_site,
self.ftp_dir,
self.data_files[0],
exclude_day=True)
return data_file_date
headers = {"Accept": "application/json"}
info_response = requests.get('https://www.genenames.org/rest/info', headers=headers)
if info_response.ok:
info_json = info_response.json()
modified_date = info_json['lastModified']
latest_version = modified_date.split('T')[0]
return latest_version
else:
info_response.raise_for_status()

def get_data(self) -> int:
"""
Gets the HGNC data from two sources.
"""
# get a reference to the data gathering class
gd: GetData = GetData(self.logger.level)
file_count: int = gd.pull_via_ftp(self.ftp_site, self.ftp_dir, [self.complete_set_file_name], self.data_path)

# get the gene groups dataset
# byte_count: int = gd.pull_via_http('https://www.genenames.org/cgi-bin/genegroup/download-all/' + self.self.gene_groups_file_name, self.data_path)

return file_count
gd: GetData = GetData()
data_file_url = self.data_url + self.data_file
gd.pull_via_http(url=data_file_url, data_dir=self.data_path)
return True

def parse_data(self) -> dict:
"""
Expand Down

0 comments on commit 6b8f389

Please sign in to comment.