Restructured 01_process.py to create an HDT file

biobricks-ai · Nov 28, 2024 · 1a95021 · 1a95021
1 parent 7e5bae9
commit 1a95021
Showing 1 changed file with 102 additions and 53 deletions.
diff --git a/stages/01_process.py b/stages/01_process.py
@@ -1,13 +1,23 @@
 import biobricks as bb
 import pandas as pd
 import json
-import pathlib 
+import pathlib
+import shutil
 from tqdm import tqdm
+from rdflib import Graph, Literal, Namespace, RDF, URIRef
+import subprocess
 
 tqdm.pandas()
 
-outdir = pathlib.Path('cache/process')
+# cachedir for ttl and nt files, if needed
+cachedir = pathlib.Path('cache/process')
+cachedir.mkdir(parents=True, exist_ok=True)
+# remove unneeded files after (ttl, nt)
+
+# outdir should be brick (hdt file only)
+outdir = pathlib.Path('./brick')
 outdir.mkdir(parents=True, exist_ok=True)
+
 pa_brick = bb.assets('pubchem-annotations')
 
 # pa_brick has a single table `annotations_parquet`
@@ -19,69 +29,108 @@
 row0 = rawpa.iloc[0]
 print(json.dumps(row0.apply(str).to_dict(), indent=4))
 
-# - [x] create chemical
-# - [x] create annotation
-# - [x] create annotation has_subject chemical
-# - [x] create annotation has_value value
-#       extract this vaue from row.Data.Value.StringWithMarkup
-#       create a new triple for each string and the StringWithMarkup array
-#       check dcterms ontology for a good predicate to associate string with markup value to its annotation
-# - [x] remove filter to allow multiple pubchem CIDs for annotation
-
-annotations = []
-annotations.append(
-'''@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
-@prefix pccompound: <http://rdf.ncbi.nlm.nih.gov/pubchem/compound/> .
-@prefix pcannotation: <http://rdf.ncbi.nlm.nih.gov/pubchem/annotation/> .
-@prefix oa: <http://www.w3.org/ns/oa#> .
-@prefix dc: <http://purl.org/dc/elements/1.1/>.
+# Create a new RDF graph
+g = Graph()
+
+# Define namespaces
+namespaces_sources = {
+    "rdf" : "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
+    "pccompound" : "http://rdf.ncbi.nlm.nih.gov/pubchem/compound/",
+    "pcsubstance" : "http://rdf.ncbi.nlm.nih.gov/pubchem/substance/",
+    "pcannotation" : "http://rdf.ncbi.nlm.nih.gov/pubchem/annotation/",
+    "oa" : "http://www.w3.org/ns/oa#",
+    "dc" : "http://purl.org/dc/elements/1.1/",
+}
+
+namespaces = {key: Namespace(val) for key, val in namespaces_sources.items()}
+
+# Bind namespaces
+for key, val in namespaces.items():
+    g.bind(key, val)
+
+''' VISUAL REPRESENTATION OF GRAPH COMPONENT
+*markup is short for string_with_markup
+
+                         +------------------+
+                 +-------|  annotation_iri  |
+                 |       +------------------+
+                 |             |      |      
+            RDF.type           |   OA.hasBody       
+                 |             |      |      
+                 v             |      |  +-------------------+
+       +-----------------+     |      +->|       body        |
+       |  OA.Annotation  |     |         +-------------------+
+       +-----------------+     |            |               |
+                               |         RDF.value      DC["format"]
+                               |            |               |
+                               |            v               v
+                               |  +-----------------+ +-----------------------+
+                               |  | Literal(markup) | | Literal("text/plain") |
+                               |  +-----------------+ +-----------------------+
+                               |
+                  OA.hasTarget / DC.subject
+                               |
+                  +---------+--o--------------+-------------+
+                  |         |                 |             |
+                  v         |                 v             |
+	  +-------------------+ |        +-------------------+  |
+	  |   compound_iri_1  | |  ...   |   compound_iri_m  |  | 
+	  +-------------------+ |        +-------------------+  |
+	                        v                               v
+				  +-------------------+          +-------------------+ 
+				  |  substance_iri_1  |   ...    |  substance_iri_n  |   
+				  +-------------------+          +-------------------+
 '''
-)
-with open(outdir / 'annotations.ttl', 'w') as f:
-    f.write(annotations[0])
 
 # loop through rawpa creating a chemical for each row
-for index, row in tqdm(rawpa.iterrows()):
+# convert to list? for ETC
+for index, row in tqdm(rawpa.iterrows(), total = len(rawpa), desc = "Processing rows"):
     cid = row['PubChemCID']
-    # chem_iri = f"http://rdf.ncbi.nlm.nih.gov/pubchem/compound/CID{cid}"    
-
-    # create an annotation
+    sid = row['PubChemSID']
     anid = row['ANID']
-    annotation_iri = f"http://rdf.ncbi.nlm.nih.gov/pubchem/annotation/ANID{anid}"
+
+    # Create URIs
+    annotation_iri = URIRef(namespaces["pcannotation"] + f"ANID{anid}")
+    compound_iri = [URIRef(namespaces["pccompound"] + f"CID{c}") for c in cid]
+    substance_iri = [URIRef(namespaces["pcsubstance"] + f"CID{s}") for s in sid]
 
     # create the value for the annotation
     # # Parse the Data Field as JSON
     data = json.loads(row['Data'])
-    string_with_markup = data.get('Value', {}).get('StringWithMarkup', [{}])[0].get('String', '')
-    string_with_markup = string_with_markup.replace('\\', '\\\\')
-    string_with_markup = string_with_markup.replace('"', r'\"')
-
-    annotations.append(
-f'''
-pcannotation:ANID{anid}
-  a oa:Annotation ;
-'''
-    )
+    # # annotation may have multiple values
+    string_with_markup_list = [markup.get('String', '') for markup in data.get('Value', {}).get('StringWithMarkup', [])]
+
+    # add triples to the graph
+    g.add((annotation_iri, RDF.type, namespaces["oa"].Annotation))
 
     # add the CID to the annotation, skip if there are no CIDs
-    if len(cid) > 0:
-      for c in cid:
-        annotations[-1] += f'  oa:hasTarget pccompound:CID{c} ;\n'
-      for c in cid:
-        annotations[-1] += f'  dc:subject   pccompound:CID{c} ;\n'
+    for ci in compound_iri:
+        g.add((annotation_iri, namespaces["oa"].hasTarget, ci))
+        g.add((annotation_iri, namespaces["dc"].subject, ci))
+
+    # add SID to the annotation, skip if there are no SIDs
+    for si in substance_iri:
+        g.add((annotation_iri, namespaces["oa"].hasTarget, si))
+        g.add((annotation_iri, namespaces["dc"].subject, si))
 
+    body = URIRef(f"{annotation_iri}/body")
+    g.add((annotation_iri, namespaces["oa"].hasBody, body))
     # triple quotes used to allow multi-line strings
-    # space after {string_with_markup} ensures quotes not broken
-    annotations[-1] += \
-fr'''  oa:hasBody [
-  rdf:value """{string_with_markup} """ ;
-  dc:format "text/plain"
-] .
-'''
+    for swm in string_with_markup_list:
+        g.add((body, RDF.value, Literal(swm)))
+
+    g.add((body, namespaces["dc"]["format"], Literal("text/plain")))
+
+print("Creating HDT file ...")
+# Serialize the graph to a string in Turtle format
+turtle_file = str(cachedir / "temp_graph.ttl")
+g.serialize(destination=turtle_file, format='turtle')
 
-    # write the annotation to a turtle file
-    with open(outdir / 'annotations.ttl', 'a') as f:
-        f.write(annotations[-1])
-    
-    # add a has_annotation
+# Convert the Turtle file to an HDT file
+hdt_file = str(outdir / 'annotations.hdt')
+# # Conversion using the command-line tool rdf2hdt
+subprocess.run(["rdf2hdt", turtle_file, hdt_file], check=True)
+print(f"Done writing HDT file to {hdt_file}")
 
+# delete cache directory
+shutil.rmtree(pathlib.Path('cache'))