Skip to content

Commit

Permalink
Restructured 01_process.py to create an HDT file
Browse files Browse the repository at this point in the history
  • Loading branch information
jpmikhail committed Nov 28, 2024
1 parent 7e5bae9 commit 1a95021
Showing 1 changed file with 102 additions and 53 deletions.
155 changes: 102 additions & 53 deletions stages/01_process.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,23 @@
import biobricks as bb
import pandas as pd
import json
import pathlib
import pathlib
import shutil
from tqdm import tqdm
from rdflib import Graph, Literal, Namespace, RDF, URIRef
import subprocess

tqdm.pandas()

outdir = pathlib.Path('cache/process')
# cachedir for ttl and nt files, if needed
cachedir = pathlib.Path('cache/process')
cachedir.mkdir(parents=True, exist_ok=True)
# remove unneeded files after (ttl, nt)

# outdir should be brick (hdt file only)
outdir = pathlib.Path('./brick')
outdir.mkdir(parents=True, exist_ok=True)

pa_brick = bb.assets('pubchem-annotations')

# pa_brick has a single table `annotations_parquet`
Expand All @@ -19,69 +29,108 @@
row0 = rawpa.iloc[0]
print(json.dumps(row0.apply(str).to_dict(), indent=4))

# - [x] create chemical
# - [x] create annotation
# - [x] create annotation has_subject chemical
# - [x] create annotation has_value value
# extract this value from row.Data.Value.StringWithMarkup
# create a new triple for each string in the StringWithMarkup array
# check dcterms ontology for a good predicate to associate string with markup value to its annotation
# - [x] remove filter to allow multiple pubchem CIDs for annotation

annotations = []
annotations.append(
'''@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix pccompound: <http://rdf.ncbi.nlm.nih.gov/pubchem/compound/> .
@prefix pcannotation: <http://rdf.ncbi.nlm.nih.gov/pubchem/annotation/> .
@prefix oa: <http://www.w3.org/ns/oa#> .
@prefix dc: <http://purl.org/dc/elements/1.1/>.
# Start from an empty RDF graph that will collect every annotation triple
g = Graph()

# Prefix -> base IRI for each vocabulary referenced while building the graph
namespaces_sources = {
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "pccompound": "http://rdf.ncbi.nlm.nih.gov/pubchem/compound/",
    "pcsubstance": "http://rdf.ncbi.nlm.nih.gov/pubchem/substance/",
    "pcannotation": "http://rdf.ncbi.nlm.nih.gov/pubchem/annotation/",
    "oa": "http://www.w3.org/ns/oa#",
    "dc": "http://purl.org/dc/elements/1.1/",
}

# Wrap each base IRI in an rdflib Namespace, keyed by its prefix
namespaces = {}
for prefix, base_iri in namespaces_sources.items():
    namespaces[prefix] = Namespace(base_iri)

# Register every prefix/namespace pair on the graph
for prefix, namespace in namespaces.items():
    g.bind(prefix, namespace)

''' VISUAL REPRESENTATION OF GRAPH COMPONENT
*markup is short for string_with_markup
+------------------+
+-------| annotation_iri |
| +------------------+
| | |
RDF.type | OA.hasBody
| | |
v | | +-------------------+
+-----------------+ | +->| body |
| OA.Annotation | | +-------------------+
+-----------------+ | | |
| RDF.value DC["format"]
| | |
| v v
| +-----------------+ +-----------------------+
| | Literal(markup) | | Literal("text/plain") |
| +-----------------+ +-----------------------+
|
OA.hasTarget / DC.subject
|
+---------+--o--------------+-------------+
| | | |
v | v |
+-------------------+ | +-------------------+ |
| compound_iri_1 | | ... | compound_iri_m | |
+-------------------+ | +-------------------+ |
v v
+-------------------+ +-------------------+
| substance_iri_1 | ... | substance_iri_n |
+-------------------+ +-------------------+
'''
)
with open(outdir / 'annotations.ttl', 'w') as f:
f.write(annotations[0])

# loop through rawpa creating a chemical for each row
for index, row in tqdm(rawpa.iterrows()):
# convert to list? for ETC
for index, row in tqdm(rawpa.iterrows(), total = len(rawpa), desc = "Processing rows"):
cid = row['PubChemCID']
# chem_iri = f"http://rdf.ncbi.nlm.nih.gov/pubchem/compound/CID{cid}"

# create an annotation
sid = row['PubChemSID']
anid = row['ANID']
annotation_iri = f"http://rdf.ncbi.nlm.nih.gov/pubchem/annotation/ANID{anid}"

# Create URIs.
# PubChem RDF names compounds .../compound/CID<n> and substances
# .../substance/SID<n>, so each list is built with its own identifier prefix.
annotation_iri = URIRef(namespaces["pcannotation"] + f"ANID{anid}")
compound_iri = [URIRef(namespaces["pccompound"] + f"CID{c}") for c in cid]
substance_iri = [URIRef(namespaces["pcsubstance"] + f"SID{s}") for s in sid]

# create the value for the annotation
# # Parse the Data Field as JSON
data = json.loads(row['Data'])
string_with_markup = data.get('Value', {}).get('StringWithMarkup', [{}])[0].get('String', '')
string_with_markup = string_with_markup.replace('\\', '\\\\')
string_with_markup = string_with_markup.replace('"', r'\"')

annotations.append(
f'''
pcannotation:ANID{anid}
a oa:Annotation ;
'''
)
# # annotation may have multiple values
string_with_markup_list = [markup.get('String', '') for markup in data.get('Value', {}).get('StringWithMarkup', [])]

# add triples to the graph
g.add((annotation_iri, RDF.type, namespaces["oa"].Annotation))

# add the CID to the annotation, skip if there are no CIDs
if len(cid) > 0:
for c in cid:
annotations[-1] += f' oa:hasTarget pccompound:CID{c} ;\n'
for c in cid:
annotations[-1] += f' dc:subject pccompound:CID{c} ;\n'
oa = namespaces["oa"]
dc = namespaces["dc"]

# Each compound is recorded as both the target and the subject of the
# annotation; an empty list adds no triples.
for compound in compound_iri:
    g.add((annotation_iri, oa.hasTarget, compound))
    g.add((annotation_iri, dc.subject, compound))

# Substances are linked to the annotation with the same two predicates;
# an empty list adds no triples.
for substance in substance_iri:
    g.add((annotation_iri, oa.hasTarget, substance))
    g.add((annotation_iri, dc.subject, substance))

# The body node gets its own IRI derived from the annotation IRI
body = URIRef(f"{annotation_iri}/body")
g.add((annotation_iri, oa.hasBody, body))
# triple quotes used to allow multi-line strings
# space after {string_with_markup} ensures quotes not broken
annotations[-1] += \
fr''' oa:hasBody [
rdf:value """{string_with_markup} """ ;
dc:format "text/plain"
] .
'''
for swm in string_with_markup_list:
g.add((body, RDF.value, Literal(swm)))

g.add((body, namespaces["dc"]["format"], Literal("text/plain")))

print("Creating HDT file ...")
# Serialize the graph to a string in Turtle format
turtle_file = str(cachedir / "temp_graph.ttl")
g.serialize(destination=turtle_file, format='turtle')

# write the annotation to a turtle file
with open(outdir / 'annotations.ttl', 'a') as f:
f.write(annotations[-1])
# add a has_annotation
# Convert the Turtle file to an HDT file written into outdir
hdt_file = str(outdir / 'annotations.hdt')
# Conversion is delegated to the external command-line tool rdf2hdt, which
# must be resolvable on PATH; check=True raises CalledProcessError if the
# tool exits with a non-zero status
subprocess.run(["rdf2hdt", turtle_file, hdt_file], check=True)
print(f"Done writing HDT file to {hdt_file}")

# Delete the whole top-level 'cache' directory, not only cachedir
# ('cache/process'), which removes the intermediate Turtle file with it.
# NOTE(review): confirm nothing else needs to persist under 'cache'.
shutil.rmtree(pathlib.Path('cache'))

0 comments on commit 1a95021

Please sign in to comment.