Merge pull request #263 from RobokopU24/hgnc_fix

updating HGNC file location and version date for new HGNC set up
RobokopU24 · Oct 24, 2024 · 6b8f389 · 6b8f389
2 parents b5fcc32 + f604791
commit 6b8f389
Showing 1 changed file with 20 additions and 28 deletions.
diff --git a/parsers/hgnc/src/loadHGNC.py b/parsers/hgnc/src/loadHGNC.py
@@ -1,6 +1,7 @@
 import argparse
 import csv
 import os
+import requests
 
 from Common.utils import GetData
 from Common.loader_interface import SourceDataLoader
@@ -10,7 +11,7 @@
 
 
 ##############
-# Class: HGNC metabolites loader
+# Class: HGNC loader
 #
 # By: Phil Owen
 # Date: 3/31/2021
@@ -21,56 +22,47 @@ class HGNCLoader(SourceDataLoader):
     source_id: str = HGNC
     provenance_id: str = 'infores:hgnc'
     description = "The HUGO Gene Nomenclature Committee (HGNC) database provides open access to HGNC-approved unique symbols and names for human genes, gene groups, and associated resources, including links to genomic, proteomic and phenotypic information."
-    source_data_url = "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/"
+    source_data_url = "https://www.genenames.org/download/archive/"
     license = "https://www.genenames.org/about/"
     attribution = "https://www.genenames.org/about/"
-    parsing_version: str = '1.2'
+    parsing_version: str = '1.3'
 
     def __init__(self, test_mode: bool = False, source_data_dir: str = None):
         """
         :param test_mode - sets the run into test mode
         :param source_data_dir - the specific storage directory to save files in
         """
         super().__init__(test_mode=test_mode, source_data_dir=source_data_dir)
-
+        self.source_db = 'HUGO Gene Nomenclature Committee'
         self.complete_set_file_name = 'hgnc_complete_set.txt'
-        # self.gene_groups_file_name = 'hgnc_genes_in_groups.txt'
-        self.data_files: list = [self.complete_set_file_name,
-                                 # self.gene_groups_file_name
-                                 ]
-        self.test_mode: bool = test_mode
-        self.source_db: str = 'HUGO Gene Nomenclature Committee'
-
-        self.ftp_site = 'ftp.ebi.ac.uk'
-        self.ftp_dir = '/pub/databases/genenames/hgnc/tsv/'
+        self.data_file = self.complete_set_file_name
+        self.data_url = "https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/"
 
     def get_latest_source_version(self) -> str:
         """
         gets the version of the data
 
         :return: the data version
         """
-        data_puller = GetData()
-        # HGNC files change very frequently, excluding the day makes sure we only update it once per month
-        data_file_date = data_puller.get_ftp_file_date(self.ftp_site,
-                                                       self.ftp_dir,
-                                                       self.data_files[0],
-                                                       exclude_day=True)
-        return data_file_date
+        headers = {"Accept": "application/json"}
+        info_response = requests.get('https://www.genenames.org/rest/info', headers=headers)
+        if info_response.ok:
+            info_json = info_response.json()
+            modified_date = info_json['lastModified']
+            latest_version = modified_date.split('T')[0]
+            return latest_version
+        else:
+            info_response.raise_for_status()
 
     def get_data(self) -> int:
         """
         Gets the HGNC data from two sources.
 
         """
-        # get a reference to the data gathering class
-        gd: GetData = GetData(self.logger.level)
-        file_count: int = gd.pull_via_ftp(self.ftp_site, self.ftp_dir, [self.complete_set_file_name], self.data_path)
-
-        # get the gene groups dataset
-        # byte_count: int = gd.pull_via_http('https://www.genenames.org/cgi-bin/genegroup/download-all/' + self.self.gene_groups_file_name, self.data_path)
-
-        return file_count
+        gd: GetData = GetData()
+        data_file_url = self.data_url + self.data_file
+        gd.pull_via_http(url=data_file_url, data_dir=self.data_path)
+        return True
 
     def parse_data(self) -> dict:
         """