expand custom ruleset context configs and add secTag configs

OHNLP · Aug 14, 2023 · 3fcf463 · 3fcf463
1 parent 87a5c6d
commit 3fcf463
Show file tree

Hide file tree

Showing 6 changed files with 118 additions and 25 deletions.
diff --git a/pom.xml b/pom.xml
@@ -6,7 +6,7 @@
 
     <groupId>org.ohnlp.medtagger</groupId>
     <artifactId>medtagger</artifactId>
-    <version>1.0.62</version>
+    <version>1.0.63</version>
     <description>The MedTagger biomedical information extraction pipeline</description>
 
 

diff --git a/src/main/java/org/ohnlp/medtagger/backbone/MedTaggerBackboneTransform.java b/src/main/java/org/ohnlp/medtagger/backbone/MedTaggerBackboneTransform.java
@@ -12,6 +12,7 @@
 import org.apache.uima.cas.CASException;
 import org.apache.uima.fit.factory.AggregateBuilder;
 import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.factory.ExternalResourceFactory;
 import org.apache.uima.fit.internal.ResourceManagerFactory;
 import org.apache.uima.fit.util.JCasUtil;
 import org.apache.uima.jcas.JCas;
@@ -34,9 +35,7 @@
 import java.io.IOException;
 import java.net.URI;
 import java.net.URISyntaxException;
-import java.nio.file.FileSystem;
-import java.nio.file.FileSystemAlreadyExistsException;
-import java.nio.file.FileSystems;
+import java.nio.file.*;
 import java.text.SimpleDateFormat;
 import java.util.*;
 import java.util.concurrent.*;
@@ -80,6 +79,20 @@ public class MedTaggerBackboneTransform extends OneToOneTransform {
             required = false
     )
     private InputColumn noteIdField = NOTE_ID_COLUMN_DEF;
+    @ConfigurationProperty(
+            path = "ruleset_context",
+            desc = "Path to context definition file (contextRule.txt) relative to the resources folder, or \"DEFAULT\" " +
+                    "to use Ruleset-Supplied if present, otherwise Global Defaults. " ,
+            required = false
+    )
+    private String customContext = "DEFAULT";
+    @ConfigurationProperty(
+            path = "sectag",
+            desc = "Path to section tagging definition relative to the resources folder. " +
+                    "Can also use \"DEFAULT\" for SecTag defaults" ,
+            required = false
+    )
+    private String secTag = "DEFAULT";
     private Schema outputSchema;
     private boolean outputJSON;
 
@@ -123,7 +136,7 @@ public Schema calculateOutputSchema(Schema schema) {
     @Override
     public PCollection<Row> expand(PCollection<Row> input) {
         return input.apply("MedTagger Concept Extraction",
-                ParDo.of(new MedTaggerPipelineFunction(this.inputField.getSourceColumnName(), this.resources, this.mode, this.noteIdField.getSourceColumnName(), this.outputSchema)));
+                ParDo.of(new MedTaggerPipelineFunction(this.inputField.getSourceColumnName(), this.resources, this.mode, this.noteIdField.getSourceColumnName(), this.customContext, this.secTag, this.outputSchema)));
     }
 
     private static class MedTaggerPipelineFunction extends DoFn<Row, Row> {
@@ -134,17 +147,21 @@ private static class MedTaggerPipelineFunction extends DoFn<Row, Row> {
         private final RunMode mode;
         private final String noteIdField;
         private final Schema outputSchema;
+        private final String context;
+        private final String secTag;
 
         // UIMA components are not serializable, and thus must be initialized per-executor via the @Setup annotation
         private transient AnalysisEngine aae;
         private transient ResourceManager resMgr;
         private transient CAS cas;
 
-        public MedTaggerPipelineFunction(String textField, String resourceFolder, RunMode mode, String noteIdField, Schema outputSchema) {
+        public MedTaggerPipelineFunction(String textField, String resourceFolder, RunMode mode, String noteIdField, String context, String secTag, Schema outputSchema) {
             this.textField = textField;
             this.resourceFolder = resourceFolder;
             this.mode = mode;
             this.noteIdField = noteIdField;
+            this.context = context;
+            this.secTag = secTag;
             this.outputSchema = outputSchema;
         }
 
@@ -154,7 +171,22 @@ public void init() throws IOException, InvalidXMLException, URISyntaxException,
                 INIT_MUTEX_LOCK.lock();
                 AggregateBuilder ae = new AggregateBuilder();
                 // Tokenization, Sentence Splitting, Section Detection, etc.
-                ae.add(createEngineDescription("desc.backbone.aes.PreConceptExtractionAE"));
+                if (this.secTag.equalsIgnoreCase("DEFAULT")) { // "DEFAULT" (any case): use the descriptor's built-in section definitions
+                    ae.add(createEngineDescription("desc.backbone.aes.PreConceptExtractionAE"));
+                } else { // otherwise treat the configured value as a path under /resources/ on the classpath
+                    URI secTag = MedTaggerPipelineFunction.class.getResource("/resources/" + this.secTag).toURI(); // NOTE(review): getResource returns null for a missing resource, so this would throw a NullPointerException
+                    Map<String, String> env = new HashMap<>();
+                    env.put("create", "true");
+                    try {
+                        // Try to create a file system for the resource URI; if one already exists for it, that is fine and the exception is ignored
+                        FileSystem fs = FileSystems.newFileSystem(secTag, env);
+                    } catch (FileSystemAlreadyExistsException ignored) {
+                    }
+                    ae.add(createEngineDescription("desc.backbone.aes.PreConceptExtractionAE",
+                            "section_map", // NOTE(review): RulebasedSectionAnnotator in this commit reads the "sectag_ruleset" config parameter; confirm the aggregate maps "section_map" onto it
+                            secTag.toString()
+                            ));
+                }
                 // Add the appropriate NER/normalization component depending on run mode
                 URI uri = null;
                 switch (mode) {
@@ -213,10 +245,42 @@ public void init() throws IOException, InvalidXMLException, URISyntaxException,
                 }
 
                 // Add Context handling
-                if (uri != null && mode != RunMode.STANDALONE_DICT_ONLY) {
-                    ae.add(AnalysisEngineFactory.createEngineDescription(RuleContextAnnotator.class, "context_ruleset", uri.toString()));
+                if (uri != null && !mode.equals(RunMode.STANDALONE_DICT_ONLY) && !mode.equals(RunMode.GENERAL_CLINICAL)) {
+                    if (this.context.equalsIgnoreCase("DEFAULT")) {
+                        if (Files.exists(Paths.get(uri).resolve("context").resolve("contextRule.txt"))) {
+                            ae.add(AnalysisEngineFactory.createEngineDescription(RuleContextAnnotator.class, "context_ruleset", uri.toString()));
+                        } else {
+                            ae.add(AnalysisEngineFactory.createEngineDescription(RuleContextAnnotator.class));
+                        }
+                    } else {
+                        URI contextURI = MedTaggerPipelineFunction.class.getResource("/resources/" + context).toURI();
+                        Map<String, String> env = new HashMap<>();
+                        env.put("create", "true");
+                        try {
+                            // Ensure it is created, ignore if not
+                            FileSystem fs = FileSystems.newFileSystem(contextURI, env);
+                        } catch (FileSystemAlreadyExistsException ignored) {
+                        }
+                        ae.add(AnalysisEngineFactory.createEngineDescription(RuleContextAnnotator.class, "context_ruleset", contextURI.toString()));
+                    }
                 } else {
-                    ae.add(AnalysisEngineFactory.createEngineDescription(RuleContextAnnotator.class));
+                    if (mode.equals(RunMode.STANDALONE_DICT_ONLY) || mode.equals(RunMode.GENERAL_CLINICAL)) { // the two modes excluded by the outer condition: still honor an explicit ruleset_context override
+                        if (this.context.equalsIgnoreCase("DEFAULT")) {
+                            ae.add(AnalysisEngineFactory.createEngineDescription(RuleContextAnnotator.class)); // no "context_ruleset" parameter: the annotator falls back to its bundled contextRule.txt
+                        } else {
+                            URI contextURI = Objects.requireNonNull(MedTaggerPipelineFunction.class.getResource("/resources/" + context), "Context ruleset not found on classpath: /resources/" + context).toURI(); // fail with a descriptive message instead of a bare NullPointerException
+                            Map<String, String> env = new HashMap<>();
+                            env.put("create", "true");
+                            try {
+                                // Try to create a file system for the resource URI; if one already exists for it, that is fine and the exception is ignored
+                                FileSystem fs = FileSystems.newFileSystem(contextURI, env);
+                            } catch (FileSystemAlreadyExistsException ignored) {
+                            }
+                            ae.add(AnalysisEngineFactory.createEngineDescription(RuleContextAnnotator.class, "context_ruleset", contextURI.toString()));
+                        }
+                    } else { // only reachable when uri == null in one of the other run modes
+                        ae.add(AnalysisEngineFactory.createEngineDescription(RuleContextAnnotator.class));
+                    }
                 }
 
                 this.resMgr = ResourceManagerFactory.newResourceManager();

diff --git a/src/main/java/org/ohnlp/medtagger/context/RuleContextAnnotator.java b/src/main/java/org/ohnlp/medtagger/context/RuleContextAnnotator.java
@@ -77,7 +77,13 @@ public void initialize(UimaContext ctxt) throws ResourceInitializationException
             if (ruleset == null) {
                 is = ConTexTSettings.class.getResourceAsStream("/medtaggerresources/context/contextRule.txt");
             } else {
-                is = Files.newInputStream(Paths.get(URI.create(ruleset)).resolve("context").resolve("contextRule.txt"));
+                if (ruleset.endsWith("contextRule.txt")) {
+                    // is an explicit file mention
+                    is = Files.newInputStream(Paths.get(URI.create(ruleset)));
+                } else {
+                    // is a ruleset dir
+                    is = Files.newInputStream(Paths.get(URI.create(ruleset)).resolve("context").resolve("contextRule.txt"));
+                }
             }
             contextSettings = new LinkedList<>();
             for (int priority : RULE_PRIORITIES) {

diff --git a/src/main/java/org/ohnlp/medtagger/sectag/RulebasedSectionAnnotator.java b/src/main/java/org/ohnlp/medtagger/sectag/RulebasedSectionAnnotator.java
@@ -24,14 +24,12 @@
 
 package org.ohnlp.medtagger.sectag;
 
-import java.io.BufferedReader;
-import java.io.FileReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
+import java.io.*;
+import java.net.URI;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.*;
 
 import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
@@ -43,6 +41,7 @@
 
 //import org.ohnlp.medtagger.type.ConceptMention;
 
+import org.ohnlp.medtagger.context.impl.ConTexTSettings;
 import org.ohnlp.typesystem.type.textspan.Segment;
 import org.ohnlp.typesystem.type.textspan.Sentence;
 import org.ohnlp.medtagger.lvg.LvgLookup;
@@ -171,10 +170,34 @@ public void initialize(UimaContext aContext)
 		sectionMap = new HashMap<String, String>();
 		try {
 			lvg  = new LvgLookup(aContext);
-
-			BufferedReader br = new BufferedReader(
-					new InputStreamReader(aContext.getResourceAsStream("section_map"),
-							"UTF-8"));
+
+			String ruleset = (String) aContext.getConfigParameterValue("sectag_ruleset"); // optional URI string of a custom section definition; null when not configured
+			InputStream is  = null;
+			try {
+				if (ruleset == null) { // nothing configured: load the section definitions bundled on the classpath
+					is = ConTexTSettings.class.getResourceAsStream("/medtaggerresources/sectag/SecTag.section.txt");
+				} else {
+					if (ruleset.endsWith(".txt")) {
+						// the URI names a definition file directly
+						is = Files.newInputStream(Paths.get(URI.create(ruleset)));
+					} else {
+						// the URI names a ruleset directory; read Sections.txt inside it
+						is = Files.newInputStream(Paths.get(URI.create(ruleset)).resolve("Sections.txt"));
+					}
+				}
+			} catch (IOException e) {
+				throw new ResourceInitializationException(e);
+			}
+			BufferedReader br;
+			if (is == null) {
+				// Legacy fallback: the bundled classpath resource was not found, so read the "section_map" external resource instead
+				br = new BufferedReader(
+						new InputStreamReader(aContext.getResourceAsStream("section_map"),
+								"UTF-8"));
+			} else {
+				br = new BufferedReader(
+						new InputStreamReader(is, StandardCharsets.UTF_8));
+			}
 
 			while(br.ready()){
 				String str = br.readLine();

diff --git a/src/main/resources/desc/backbone/aes/MedTaggerDictionaryLookupAE.xml b/src/main/resources/desc/backbone/aes/MedTaggerDictionaryLookupAE.xml
@@ -76,7 +76,7 @@
                 <name>lvg_dict</name>
                 <description/>
                 <fileResourceSpecifier>
-                    <fileUrl>file:medtaggerresources/lvg/LRAGR_2011AB</fileUrl>
+                    <fileUrl>file:medtaggerresources/lvg/LRAGR_2021AB</fileUrl>
                 </fileResourceSpecifier>
             </externalResource>
 

diff --git a/src/main/resources/desc/backbone/aes/PreConceptExtractionAE.xml b/src/main/resources/desc/backbone/aes/PreConceptExtractionAE.xml
@@ -104,7 +104,7 @@
                 <name>section_map</name>
                 <description/>
                 <fileResourceSpecifier>
-                    <fileUrl>file:medtaggerieresources/pad/Radiology_Section</fileUrl>
+                    <fileUrl>file:medtaggerresources/sectag/SecTag.section.txt</fileUrl>
                 </fileResourceSpecifier>
             </externalResource>