forked from s-andrews/sradownloader
-
Notifications
You must be signed in to change notification settings - Fork 1
/
SRAmeta2config.py
executable file
·65 lines (54 loc) · 2.42 KB
/
SRAmeta2config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env python
import os
import re, argparse

import pandas as pd
import requests
# Command-line interface: -f names the SRA metadata CSV to convert and -o
# optionally names the Excel file to write.
usage = "Converts SRA metafile into excel with additional data. The Excel file can be used as input for SRA_download.py"
# The text is a description of the tool, so it is passed as `description`.
# argparse's `usage` argument would replace the auto-generated synopsis
# (program name and flags) shown by -h and in error messages.
parser = argparse.ArgumentParser(
    description=usage,
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
files = parser.add_argument_group('Options for input files')
files.add_argument(
    "-f",
    dest="csv_file",
    help="CSV file name or path",
    required=True,
    type=str,
)
files.add_argument(
    "-o",
    dest="out_file",
    help="output file name or path. If empty the suffix is replaced to .xlsx",
    default=None,
)
args = parser.parse_args()

f_name = args.csv_file
if args.out_file:
    out_file = args.out_file
else:
    # Replace only the final extension. Splitting on the first "." would
    # truncate paths such as "./meta.csv" (-> ".xlsx") or "run.table.csv"
    # (-> "run.xlsx").
    out_file = os.path.splitext(f_name)[0] + ".xlsx"
def get_geo_name(srr_number, splitName=True, timeout=30):
    """Return an underscore-joined sample name taken from an SRA record title.

    Requests the plain-text NCBI SRA record for ``srr_number`` and uses the
    first line that starts with ``Title:``. That line is split on runs of
    colons, semicolons, commas and spaces; the leading ``Title`` token is
    dropped and the remaining tokens are joined with ``_``.

    Args:
        srr_number: Value used as the ``term`` query parameter (an SRA run
            accession such as the entries of the ``Run`` column).
        splitName: When true, the joined name is split on ``_`` again and
            the first piece and the last three pieces are discarded.
            NOTE(review): presumably this strips the leading sample
            accession and trailing organism/strategy words -- confirm
            against real titles.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        The derived name. An empty string when no ``Title:`` line exists or
        when too few pieces remain after trimming.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    # Let requests build and encode the query string.
    response = requests.get(
        "https://www.ncbi.nlm.nih.gov/sra/",
        params={"term": srr_number, "format": "text"},
        timeout=timeout,
    )
    # An error page has no "Title:" line and would otherwise be returned as
    # a silently empty name.
    response.raise_for_status()

    sample_name = ""
    for line in response.text.splitlines():
        if line.startswith("Title:"):
            tokens = re.split(r"[:; ,]+", line.strip())
            sample_name = "_".join(tokens[1:])
            break

    if not splitName:
        return sample_name

    # Trim on "_" pieces of the joined name (not on the regex tokens) so that
    # underscores already present in the title count as separators as well.
    return "_".join(sample_name.split("_")[1:-3])
# Build the Excel sheet: read the SRA metadata CSV, add the columns to be
# filled in by hand (left blank), look up a sample name for every run, then
# select and rename the columns for the output file.

# Input columns that are copied into the output. They are checked before any
# lookup so that a wrong input file fails immediately, not after one HTTP
# request per row has already been made.
REQUIRED_COLUMNS = ["Run", "SRA Study", "Organism", "LibraryLayout", "AvgSpotLen"]

# (column in df_input, column name in the Excel sheet), in output order.
# NOTE(review): 'index_id' appears twice, so the sheet gets two identically
# named columns. This matches the existing output; one of them was presumably
# meant to carry a different name -- confirm before changing.
OUTPUT_COLUMNS = [
    ("Run", "SRR"),
    ("name_full", "name_full"),
    ("exp_type", "exp_type"),
    ("SRA Study", "exp_id"),
    ("author", "author"),
    ("year", "year"),
    ("month", "month"),
    ("day", "day"),
    ("bait", "bait"),
    ("background", "background"),
    ("condition", "condition"),
    ("replicate", "replicate"),
    ("Organism", "organism"),
    ("media", "media"),
    ("LibraryLayout", "SEorPE"),
    ("AvgSpotLen", "read_length"),
    ("strandness", "strandness"),
    ("barcode_id", "barcode_id"),
    ("index_id", "index_id"),
    ("index_id", "index_id"),
    ("5adapter", "5adapter"),
    ("3adapter", "3adapter"),
    ("link", "link"),
    ("strain", "strain"),
]

df_input = pd.read_csv(f_name, sep=',')

missing_columns = [col for col in REQUIRED_COLUMNS if col not in df_input.columns]
if missing_columns:
    raise KeyError(
        f"{f_name} is missing required column(s): {', '.join(missing_columns)}"
    )

# Placeholder columns, created empty for manual annotation.
for blank_column in (
    "name_full", "exp_type", "author", "year", "month", "day",
    "bait", "background", "condition", "replicate",
):
    df_input[blank_column] = ""

# One lookup per run.
# NOTE(review): the looked-up sample name is stored under 'media' -- confirm
# that this is the intended column.
df_input['media'] = df_input['Run'].apply(get_geo_name)

for blank_column in (
    "strandness", "barcode_id", "index_id", "5adapter", "3adapter",
    "link", "strain",
):
    df_input[blank_column] = ""

df_final = df_input[[source for source, _ in OUTPUT_COLUMNS]]
df_final.columns = [target for _, target in OUTPUT_COLUMNS]
df_final.to_excel(out_file, index=False)