-
Notifications
You must be signed in to change notification settings - Fork 118
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Msigdb API; mouse symbol/Entrez gene sets can now be downloaded easily #204
- Loading branch information
zqfang
committed
Aug 7, 2023
1 parent
d20deeb
commit 2e5d15c
Showing
1 changed file
with
76 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
import re | ||
|
||
import pandas as pd | ||
import requests | ||
|
||
class Msigdb:
    """Minimal client for the MSigDB release directory.

    The client reads the HTML directory listings under ``self.url`` to
    discover available releases and GMT files, and downloads individual GMT
    files as plain ``dict`` objects.

    Attributes set by ``__init__``:
        url: base URL of the release directory.
        categories: list of category names found for the requested release,
            or ``None`` when the listing could not be fetched.
        categoires: alias of ``categories`` (original, misspelled name kept
            so existing callers do not break).
    """

    # GMT file names look like "<category>.v<dbver>.<entrez|symbols>.gmt".
    # Kept on the class so it is available before any listing is parsed;
    # raw string so the regex escapes are not treated as string escapes.
    _pattern = re.compile(r"(\w.+)\.(v\d.+)\.(entrez|symbols)\.gmt")

    # Seconds to wait for the server before raising instead of hanging.
    _timeout = 30

    def __init__(self, dbver: str = "2023.1.Hs"):
        """
        dbver: MSigDB release whose categories are listed at construction
            time. default: 2023.1.Hs
        """
        self.url = "https://data.broadinstitute.org/gsea-msigdb/msigdb/release/"
        self._db_version = self._get_db_version()
        self.categories = self.list_category(dbver)
        # Backward-compatible alias for the original attribute name.
        self.categoires = self.categories

    def _read_listing(self, url):
        """Fetch ``url`` and return the rows of its first HTML table.

        All-NA rows and the first remaining row (the parent-directory entry)
        are removed. Returns ``None`` when the response status is not OK.
        """
        # Function-scope import: only this helper needs it.
        from io import StringIO

        resp = requests.get(url, timeout=self._timeout)
        if not resp.ok:
            return None
        # Wrap the markup in a file-like object; passing literal HTML to
        # read_html is deprecated in newer pandas releases.
        table = pd.read_html(StringIO(resp.text))[0]
        return table.dropna(how="all").iloc[1:]

    @staticmethod
    def _parse_gmt(text):
        """Turn tab-separated GMT text into ``{field 0: [fields 2..]}``.

        The second field of every line (the description column of the GMT
        format) is discarded. Blank lines are skipped, and ``splitlines``
        keeps a trailing carriage return out of the last gene of each line.
        """
        gene_sets = {}
        for line in text.splitlines():
            if not line.strip():
                continue
            name, *rest = line.split("\t")
            gene_sets[name] = rest[1:]
        return gene_sets

    def _get_db_version(self):
        """Return a DataFrame of the releases listed at ``self.url``.

        Columns 1 and 2 of the directory listing are kept, with the trailing
        "/" stripped from the first of them. Returns ``None`` when the
        listing could not be fetched.
        """
        listing = self._read_listing(self.url)
        if listing is None:
            return None
        d = listing.iloc[:, 1:3].reset_index(drop=True)
        d.iloc[:, 0] = d.iloc[:, 0].str.rstrip("/")
        return d

    def get_gmt(
        self,
        category: str = "h.all",
        dbver: str = "7.5.1",
        entrez: bool = False
    ):
        """Download one GMT file and return it as a dict.

        category: category prefix of the file name, e.g. "h.all" or
            "c2.cp.kegg".
        dbver: MSigDB release. default: 7.5.1 (note: differs from the
            constructor default).
        entrez: if True fetch the "entrez" file, otherwise the "symbols" one.

        Returns a dict mapping the first field of each line to the list of
        fields from the third one on, or ``None`` when the response status
        is not OK.

        example: "https://data.broadinstitute.org/gsea-msigdb/msigdb/release/2023.1.Hs/c2.cp.kegg.v2023.1.Hs.entrez.gmt"
        """
        identifier = "entrez" if entrez else "symbols"
        # self.url already ends with "/"; normalise so the path has no "//".
        base = self.url.rstrip("/")
        url = f"{base}/{dbver}/{category}.v{dbver}.{identifier}.gmt"
        resp = requests.get(url, timeout=self._timeout)
        if not resp.ok:
            return None
        return self._parse_gmt(resp.text)

    def list_dbver(self):
        """Return the release table collected at construction time."""
        return self._db_version

    def list_category(self, dbver):
        """
        dbver: MSigDB release to inspect, e.g. 2023.1.Hs

        Returns the unique category names (first regex group of each GMT
        file name) in listing order, or ``None`` when the listing could not
        be fetched.
        """
        d = self.list_gmt(dbver)
        if d is None:
            return None
        # list_gmt only keeps names that match the pattern, so match() here
        # never returns None.
        categories = (
            d.iloc[:, 0]
            .apply(lambda s: self._pattern.match(s).group(1))
            .drop_duplicates()
        )
        return categories.to_list()

    def list_gmt(self, db):
        """Return a DataFrame of the GMT files listed for release ``db``.

        Columns 1 to 3 of the directory listing are kept, restricted to rows
        whose first kept column matches the GMT file-name pattern. Returns
        ``None`` when the listing could not be fetched.
        """
        listing = self._read_listing(f"{self.url.rstrip('/')}/{db}")
        if listing is None:
            return None
        d = listing.iloc[:, 1:4]
        # na=False: a missing name is treated as "not a GMT file" rather
        # than producing a NaN that breaks boolean indexing.
        is_gmt = d.iloc[:, 0].str.match(self._pattern.pattern, na=False)
        return d[is_gmt].reset_index(drop=True)