-
Notifications
You must be signed in to change notification settings - Fork 2
/
ChEBI_JSON_Builder.py
68 lines (61 loc) · 2.14 KB
/
ChEBI_JSON_Builder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
"""
Usage:
{__file__} [options]
Options:
--input F Input SDF file from EBI. [default: ChEBI_complete.sdf.gz]
--split_count COUNT Number of lines per output file. [default: 5000]
--output_dir DIR Output directory to place JSON line files in. [default: chebi_split]
"""
import os
import mmap
import gzip
import json
import urllib
from docopt import magic
magic()
def typeChemical(d):
    """Convert the single-valued fields of a parsed ChEBI record to scalars.

    ``d`` maps field names to sized values (the caller in this file passes
    lists of strings, plus a plain string under ``"Molfile"``).  For every
    field whose value has exactly one element:

    * ``"ChEBI ID"`` becomes the ``int`` found after the last ``":"``;
    * ``"Charge"`` and ``"Star"`` become ``int``;
    * ``"Mass"`` and ``"Monoisotopic Mass"`` become ``float``;
    * ``"ChEBI Name"`` and ``"Definition"`` are unwrapped to their only
      element.

    All other fields are left untouched.  ``d`` is updated in place.

    Returns:
        ``(chebi_id, record)`` where ``record`` is a copy of ``d`` with its
        keys in sorted order, or ``None`` when ``d`` is empty or carries no
        ``"ChEBI ID"`` field.

    Raises:
        ValueError: if a numeric field does not hold a valid number.
    """
    int_fields = ("Charge", "Star")
    float_fields = ("Mass", "Monoisotopic Mass")
    unwrap_fields = ("ChEBI Name", "Definition")

    # Iterate over a snapshot: values in ``d`` are reassigned inside the loop.
    for key, value in list(d.items()):
        if len(value) != 1:
            continue
        only = value[0]
        if key == "ChEBI ID":
            d[key] = int(only.split(":")[-1])
        elif key in int_fields:
            d[key] = int(only)
        elif key in float_fields:
            d[key] = float(only)
        elif key in unwrap_fields:
            d[key] = only

    # A record without an identifier cannot be keyed or sorted; report it the
    # same way as an empty record instead of failing with KeyError.
    if "ChEBI ID" not in d:
        return None
    return (d["ChEBI ID"], dict(sorted(d.items())))
# Parsed records as returned by typeChemical(), filled by the read loop below.
entries = []
os.makedirs(arguments.output_dir, mode=0o755, exist_ok=True)


def _parse_sdf_record(text):
    """Split the text of one SDF record into a ``{field name: value}`` dict.

    ``text`` is the record without its ``$$$$`` terminator line.  The leading
    molfile block is stored verbatim under ``"Molfile"``; every ``> <Name>``
    data item becomes the list of its non-empty value lines.
    """
    chem = {}
    # The molfile block has no "> <Name>" header of its own, so give it a
    # synthetic one and let the same split handle every section uniformly.
    for section in ("\n> <Molfile>\n" + text).split("\n> <")[1:]:
        # partition() cuts only at the end of the header line, so a value
        # that itself contains ">\n" is kept whole.
        name, _, body = section.partition(">\n")
        if name == "Molfile":
            chem[name] = body
        else:
            chem[name] = [value for value in body.split("\n") if value]
    return chem


def _store_record(lines):
    """Parse one record's raw byte lines and append the result to ``entries``."""
    chem = _parse_sdf_record(b"".join(lines).decode())
    # A record with no data items at all (for example blank text after the
    # final terminator) has nothing to key it by, so it is skipped.
    if len(chem) > 1:
        entry = typeChemical(chem)
        if entry:
            entries.append(entry)


with open(arguments.input, "rb") as sdf_file:
    # access=ACCESS_READ is the portable spelling of a read-only mapping;
    # prot=PROT_READ is only accepted on Unix.
    with mmap.mmap(sdf_file.fileno(), 0, access=mmap.ACCESS_READ) as mapped:
        with gzip.GzipFile(mode="rb", fileobj=mapped) as gzfile:
            pending = []
            for line in gzfile:
                if line.strip() == b"$$$$":
                    # The terminator is not part of the record, so the last
                    # data item is parsed like every other one.
                    _store_record(pending)
                    pending = []
                else:
                    pending.append(line)
            # Keep a final record that is not followed by a terminator.
            _store_record(pending)
# Write the records as JSON lines, one file per ChEBI ID range.  The file
# "chebi-K+N.json" holds the records whose ID lies in (K, K + N].
split_count = int(arguments.split_count or 5000)
if split_count <= 0:
    raise ValueError(f"split_count must be a positive integer, got {split_count}")

split_index = 0  # upper bound (inclusive) of the ID range of the open file
out_file = None
try:
    # Sort on the ID alone so that two records sharing an ID never make
    # sorted() fall back to comparing the record dicts.
    for chebi_id, record in sorted(entries, key=lambda item: item[0]):
        if out_file is None or chebi_id > split_index:
            if out_file is not None:
                out_file.close()
            # Derive the range from the ID itself: stepping split_index by one
            # range at a time would misfile records after a gap in the IDs
            # that is wider than split_count.
            range_start = (chebi_id - 1) // split_count * split_count
            out_file = open(f"{arguments.output_dir}/chebi-{range_start}+{split_count}.json", "w")
            split_index = range_start + split_count
        out_file.write(json.dumps(record) + "\n")
finally:
    if out_file is not None:
        out_file.close()