Skip to content

Commit

Permalink
expand custom ruleset context configs and add secTag configs
Browse files Browse the repository at this point in the history
  • Loading branch information
qqndrew committed Aug 14, 2023
1 parent 87a5c6d commit 3fcf463
Show file tree
Hide file tree
Showing 6 changed files with 118 additions and 25 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

<groupId>org.ohnlp.medtagger</groupId>
<artifactId>medtagger</artifactId>
<version>1.0.62</version>
<version>1.0.63</version>
<description>The MedTagger biomedical information extraction pipeline</description>


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.apache.uima.cas.CASException;
import org.apache.uima.fit.factory.AggregateBuilder;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.ExternalResourceFactory;
import org.apache.uima.fit.internal.ResourceManagerFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
Expand All @@ -34,9 +35,7 @@
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.FileSystem;
import java.nio.file.FileSystemAlreadyExistsException;
import java.nio.file.FileSystems;
import java.nio.file.*;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.concurrent.*;
Expand Down Expand Up @@ -80,6 +79,20 @@ public class MedTaggerBackboneTransform extends OneToOneTransform {
required = false
)
private InputColumn noteIdField = NOTE_ID_COLUMN_DEF;
/**
 * Context rule selection, configured under the {@code ruleset_context} path.
 * Defaults to {@code "DEFAULT"}; the value is handed to the pipeline function as its context argument,
 * which resolves any non-default value as a classpath resource under {@code /resources/}.
 */
@ConfigurationProperty(
path = "ruleset_context",
desc = "Path to context definition file (contextRule.txt) relative to the resources folder, or \"DEFAULT\" " +
"to use Ruleset-Supplied if present, otherwise Global Defaults. " ,
required = false
)
private String customContext = "DEFAULT";
/**
 * Section tagging definition selection, configured under the {@code sectag} path.
 * Defaults to {@code "DEFAULT"}; the value is handed to the pipeline function, which resolves any
 * non-default value as a classpath resource under {@code /resources/}.
 */
@ConfigurationProperty(
path = "sectag",
desc = "Path to section tagging definition relative to the resources folder. " +
"Can also use \"DEFAULT\" for SecTag defaults" ,
required = false
)
private String secTag = "DEFAULT";
private Schema outputSchema;
private boolean outputJSON;

Expand Down Expand Up @@ -123,7 +136,7 @@ public Schema calculateOutputSchema(Schema schema) {
@Override
public PCollection<Row> expand(PCollection<Row> input) {
return input.apply("MedTagger Concept Extraction",
ParDo.of(new MedTaggerPipelineFunction(this.inputField.getSourceColumnName(), this.resources, this.mode, this.noteIdField.getSourceColumnName(), this.outputSchema)));
ParDo.of(new MedTaggerPipelineFunction(this.inputField.getSourceColumnName(), this.resources, this.mode, this.noteIdField.getSourceColumnName(), this.customContext, this.secTag, this.outputSchema)));
}

private static class MedTaggerPipelineFunction extends DoFn<Row, Row> {
Expand All @@ -134,17 +147,21 @@ private static class MedTaggerPipelineFunction extends DoFn<Row, Row> {
private final RunMode mode;
private final String noteIdField;
private final Schema outputSchema;
private final String context;
private final String secTag;

// UIMA components are not serializable, and thus must be initialized per-executor via the @Setup annotation
private transient AnalysisEngine aae;
private transient ResourceManager resMgr;
private transient CAS cas;

public MedTaggerPipelineFunction(String textField, String resourceFolder, RunMode mode, String noteIdField, Schema outputSchema) {
/**
 * Captures the settings this function needs; no UIMA component is created here.
 *
 * @param textField      name of the input column holding the text (the transform passes its input field's source column name)
 * @param resourceFolder resource folder setting of the enclosing transform
 * @param mode           run mode of the pipeline
 * @param noteIdField    name of the column holding the note identifier
 * @param context        context rule selection; compared case-insensitively against "DEFAULT" during init
 * @param secTag         section tagging selection; compared case-insensitively against "DEFAULT" during init
 * @param outputSchema   schema of the rows produced by this function
 */
public MedTaggerPipelineFunction(String textField, String resourceFolder, RunMode mode, String noteIdField, String context, String secTag, Schema outputSchema) {
this.textField = textField;
this.resourceFolder = resourceFolder;
this.mode = mode;
this.noteIdField = noteIdField;
this.context = context;
this.secTag = secTag;
this.outputSchema = outputSchema;
}

Expand All @@ -154,7 +171,22 @@ public void init() throws IOException, InvalidXMLException, URISyntaxException,
INIT_MUTEX_LOCK.lock();
AggregateBuilder ae = new AggregateBuilder();
// Tokenization, Sentence Splitting, Section Detection, etc.
ae.add(createEngineDescription("desc.backbone.aes.PreConceptExtractionAE"));
// "DEFAULT" keeps the descriptor as-is; any other value is resolved as a classpath resource
// under /resources/ and passed to the descriptor as "section_map".
if (this.secTag.equalsIgnoreCase("DEFAULT")) {
    ae.add(createEngineDescription("desc.backbone.aes.PreConceptExtractionAE"));
} else {
    // getResource returns null for a missing resource; fail with a message naming the
    // resource instead of a bare NullPointerException (exception type is unchanged).
    URI secTagUri = Objects.requireNonNull(
            MedTaggerPipelineFunction.class.getResource("/resources/" + this.secTag),
            "SecTag definition not found on classpath: /resources/" + this.secTag).toURI();
    Map<String, String> env = new HashMap<>();
    env.put("create", "true");
    try {
        // Make sure a file system exists for this URI; the handle itself is not needed here.
        // NOTE(review): presumably required so later Paths.get(URI) lookups succeed - confirm.
        FileSystems.newFileSystem(secTagUri, env);
    } catch (FileSystemAlreadyExistsException ignored) {
        // A file system for this URI is already registered; nothing to do.
    }
    // NOTE(review): the sectag annotator's initialize reads the config parameter
    // "sectag_ruleset"; confirm the descriptor maps "section_map" onto it.
    ae.add(createEngineDescription("desc.backbone.aes.PreConceptExtractionAE",
            "section_map",
            secTagUri.toString()
    ));
}
// Add the appropriate NER/normalization component depending on run mode
URI uri = null;
switch (mode) {
Expand Down Expand Up @@ -213,10 +245,42 @@ public void init() throws IOException, InvalidXMLException, URISyntaxException,
}

// Add Context handling
if (uri != null && mode != RunMode.STANDALONE_DICT_ONLY) {
ae.add(AnalysisEngineFactory.createEngineDescription(RuleContextAnnotator.class, "context_ruleset", uri.toString()));
if (uri != null && !mode.equals(RunMode.STANDALONE_DICT_ONLY) && !mode.equals(RunMode.GENERAL_CLINICAL)) {
if (this.context.equalsIgnoreCase("DEFAULT")) {
    // Use the ruleset's own context/contextRule.txt when it exists; otherwise build the
    // annotator without a "context_ruleset" parameter.
    if (Files.exists(Paths.get(uri).resolve("context").resolve("contextRule.txt"))) {
        ae.add(AnalysisEngineFactory.createEngineDescription(RuleContextAnnotator.class, "context_ruleset", uri.toString()));
    } else {
        ae.add(AnalysisEngineFactory.createEngineDescription(RuleContextAnnotator.class));
    }
} else {
    // getResource returns null for a missing resource; fail with a message naming the
    // resource instead of a bare NullPointerException (exception type is unchanged).
    URI contextURI = Objects.requireNonNull(
            MedTaggerPipelineFunction.class.getResource("/resources/" + this.context),
            "Context definition not found on classpath: /resources/" + this.context).toURI();
    Map<String, String> env = new HashMap<>();
    env.put("create", "true");
    try {
        // Make sure a file system exists for this URI; the handle itself is not needed here.
        FileSystems.newFileSystem(contextURI, env);
    } catch (FileSystemAlreadyExistsException ignored) {
        // A file system for this URI is already registered; nothing to do.
    }
    ae.add(AnalysisEngineFactory.createEngineDescription(RuleContextAnnotator.class, "context_ruleset", contextURI.toString()));
}
} else {
ae.add(AnalysisEngineFactory.createEngineDescription(RuleContextAnnotator.class));
// The two modes excluded from ruleset-supplied context can still use an explicitly
// configured context file. The original condition tested STANDALONE_DICT_ONLY twice, so
// GENERAL_CLINICAL never honoured a custom context.
if (mode.equals(RunMode.STANDALONE_DICT_ONLY) || mode.equals(RunMode.GENERAL_CLINICAL)) {
    if (this.context.equalsIgnoreCase("DEFAULT")) {
        ae.add(AnalysisEngineFactory.createEngineDescription(RuleContextAnnotator.class));
    } else {
        // getResource returns null for a missing resource; fail with a message naming the
        // resource instead of a bare NullPointerException (exception type is unchanged).
        URI contextURI = Objects.requireNonNull(
                MedTaggerPipelineFunction.class.getResource("/resources/" + this.context),
                "Context definition not found on classpath: /resources/" + this.context).toURI();
        Map<String, String> env = new HashMap<>();
        env.put("create", "true");
        try {
            // Make sure a file system exists for this URI; the handle itself is not needed here.
            FileSystems.newFileSystem(contextURI, env);
        } catch (FileSystemAlreadyExistsException ignored) {
            // A file system for this URI is already registered; nothing to do.
        }
        ae.add(AnalysisEngineFactory.createEngineDescription(RuleContextAnnotator.class, "context_ruleset", contextURI.toString()));
    }
} else {
    // Any other mode reaching this branch gets the annotator without a "context_ruleset" parameter.
    ae.add(AnalysisEngineFactory.createEngineDescription(RuleContextAnnotator.class));
}
}

this.resMgr = ResourceManagerFactory.newResourceManager();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,13 @@ public void initialize(UimaContext ctxt) throws ResourceInitializationException
if (ruleset == null) {
is = ConTexTSettings.class.getResourceAsStream("/medtaggerresources/context/contextRule.txt");
} else {
is = Files.newInputStream(Paths.get(URI.create(ruleset)).resolve("context").resolve("contextRule.txt"));
if (ruleset.endsWith("contextRule.txt")) {
// is an explicit file mention
is = Files.newInputStream(Paths.get(URI.create(ruleset)));
} else {
// is a ruleset dir
is = Files.newInputStream(Paths.get(URI.create(ruleset)).resolve("context").resolve("contextRule.txt"));
}
}
contextSettings = new LinkedList<>();
for (int priority : RULE_PRIORITIES) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,12 @@

package org.ohnlp.medtagger.sectag;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.io.*;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
Expand All @@ -43,6 +41,7 @@

//import org.ohnlp.medtagger.type.ConceptMention;

import org.ohnlp.medtagger.context.impl.ConTexTSettings;
import org.ohnlp.typesystem.type.textspan.Segment;
import org.ohnlp.typesystem.type.textspan.Sentence;
import org.ohnlp.medtagger.lvg.LvgLookup;
Expand Down Expand Up @@ -171,10 +170,34 @@ public void initialize(UimaContext aContext)
sectionMap = new HashMap<String, String>();
try {
lvg = new LvgLookup(aContext);

BufferedReader br = new BufferedReader(
new InputStreamReader(aContext.getResourceAsStream("section_map"),
"UTF-8"));

// Choose where the section map is read from, based on the optional "sectag_ruleset" config parameter.
String ruleset = (String) aContext.getConfigParameterValue("sectag_ruleset");
InputStream is = null;
try {
if (ruleset == null) {
// No parameter configured: read the SecTag section file from the classpath
// (the resource name is absolute, so the class used for the lookup does not affect the path)
is = ConTexTSettings.class.getResourceAsStream("/medtaggerresources/sectag/SecTag.section.txt");
} else {
if (ruleset.endsWith(".txt")) {
// The value is a URI that names the section file itself
is = Files.newInputStream(Paths.get(URI.create(ruleset)));
} else {
// The value is a URI of a directory; read Sections.txt inside it
// NOTE(review): the classpath default above is named SecTag.section.txt - confirm Sections.txt is the intended name here
is = Files.newInputStream(Paths.get(URI.create(ruleset)).resolve("Sections.txt"));
}
}
} catch (IOException e) {
throw new ResourceInitializationException(e);
}
BufferedReader br;
if (is == null) {
// Only reached when the classpath lookup above returned null: fall back to the
// "section_map" external resource of the UIMA context
br = new BufferedReader(
new InputStreamReader(aContext.getResourceAsStream("section_map"),
"UTF-8"));
} else {
br = new BufferedReader(
new InputStreamReader(is, StandardCharsets.UTF_8));
}

while(br.ready()){
String str = br.readLine();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
<name>lvg_dict</name>
<description/>
<fileResourceSpecifier>
<fileUrl>file:medtaggerresources/lvg/LRAGR_2011AB</fileUrl>
<fileUrl>file:medtaggerresources/lvg/LRAGR_2021AB</fileUrl>
</fileResourceSpecifier>
</externalResource>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@
<name>section_map</name>
<description/>
<fileResourceSpecifier>
<fileUrl>file:medtaggerieresources/pad/Radiology_Section</fileUrl>
<fileUrl>file:medtaggerresources/sectag/SecTag.section.txt</fileUrl>
</fileResourceSpecifier>
</externalResource>

Expand Down

0 comments on commit 3fcf463

Please sign in to comment.