diff --git a/pom.xml b/pom.xml
index c783bac..7f460b9 100644
--- a/pom.xml
+++ b/pom.xml
@@ -6,7 +6,7 @@
org.ohnlp.medtagger
medtagger
- 1.0.71
+ 1.0.72
The MedTagger biomedical information extraction pipeline
diff --git a/src/main/java/org/ohnlp/medtagger/backbone/CleanMedTaggerDictOutputTransform.java b/src/main/java/org/ohnlp/medtagger/backbone/CleanMedTaggerDictOutputTransform.java
new file mode 100644
index 0000000..1695a86
--- /dev/null
+++ b/src/main/java/org/ohnlp/medtagger/backbone/CleanMedTaggerDictOutputTransform.java
@@ -0,0 +1,70 @@
+package org.ohnlp.medtagger.backbone;
+
+import org.apache.beam.sdk.coders.BigEndianLongCoder;
+import org.apache.beam.sdk.coders.KvCoder;
+import org.apache.beam.sdk.coders.RowCoder;
+import org.apache.beam.sdk.coders.StringUtf8Coder;
+import org.apache.beam.sdk.schemas.Schema;
+import org.apache.beam.sdk.schemas.transforms.Select;
+import org.apache.beam.sdk.transforms.*;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.Row;
+import org.checkerframework.checker.initialization.qual.Initialized;
+import org.checkerframework.checker.nullness.qual.NonNull;
+import org.checkerframework.checker.nullness.qual.UnknownKeyFor;
+import org.joda.time.Duration;
+import org.ohnlp.backbone.api.annotations.ComponentDescription;
+import org.ohnlp.backbone.api.components.OneToOneTransform;
+import org.ohnlp.backbone.api.exceptions.ComponentInitializationException;
+import org.ohnlp.medtagger.lvg.LvgLookup;
+
+/** Emits one (matched_text, freq) row per normalized term; freq counts its distinct note_source_value values. */
+@ComponentDescription(
+        name = "Get Dict Freqs",
+        desc = "Gets Frequency of Dictionary Terms (Useful for Cleaning Noise from Autogenerated Dictionary Entries)"
+)
+public class CleanMedTaggerDictOutputTransform extends OneToOneTransform {
+    /** Output row layout; also reused for the intermediate grouping key, where freq is fixed to 1. */
+    private final Schema schema = Schema.of(
+            Schema.Field.of("matched_text", Schema.FieldType.STRING),
+            Schema.Field.of("freq", Schema.FieldType.INT64));
+
+    @Override
+    public Schema calculateOutputSchema(Schema schema) {
+        return this.schema; // fixed output layout; the incoming schema is not consulted
+    }
+
+    @Override
+    public PCollection<Row> expand(PCollection<Row> input) {
+        return input.apply(Select.fieldNames("matched_text", "note_source_value")).apply(ParDo.of(
+                new DoFn<Row, KV<Row, String>>() {
+                    private LvgLookup lvg;
+                    @Setup
+                    public void init() {
+                        this.lvg = new LvgLookup();
+                        lvg.localInitialize(CleanMedTaggerDictOutputTransform.class.getResourceAsStream("/medtaggerresources/lvg/LRAGR_2021AB"), CleanMedTaggerDictOutputTransform.class.getResourceAsStream("/medtaggerresources/lvg/openclasswords.txt"));
+                    }
+                    @ProcessElement
+                    public void process(ProcessContext pc) {
+                        Row in = pc.element();
+                        // Run the term through LvgLookup.getNorm, then turn every whitespace character into a tab.
+                        String norm = lvg.getNorm(in.getString("matched_text")).replaceAll("\\s", "\t");
+                        pc.output(KV.of(Row.withSchema(schema).addValues(norm, 1L).build(), in.getString("note_source_value")));
+                    }
+                })).setCoder(KvCoder.of(RowCoder.of(this.schema), StringUtf8Coder.of()))
+                .apply(Distinct.create()) // collapse repeated (term, note_source_value) pairs
+                .apply(Count.perKey()) // remaining pairs per term = distinct note_source_value count
+                .setCoder(KvCoder.of(RowCoder.of(this.schema), BigEndianLongCoder.of()))
+                .apply(MapElements.via(new SimpleFunction<KV<Row, Long>, Row>() {
+                    @Override
+                    public Row apply(KV<Row, Long> kv) {
+                        return Row.withSchema(schema).addValues(kv.getKey().getValue("matched_text"), kv.getValue()).build();
+                    }
+                }));
+    }
+
+    @Override
+    public void init() throws ComponentInitializationException { // no component-level setup needed
+    }
+}
diff --git a/src/main/java/org/ohnlp/medtagger/backbone/MedTaggerBackboneTransform.java b/src/main/java/org/ohnlp/medtagger/backbone/MedTaggerBackboneTransform.java
index 1c145d1..ed6ebad 100644
--- a/src/main/java/org/ohnlp/medtagger/backbone/MedTaggerBackboneTransform.java
+++ b/src/main/java/org/ohnlp/medtagger/backbone/MedTaggerBackboneTransform.java
@@ -140,7 +140,7 @@ public PCollection expand(PCollection input) {
}
private static class MedTaggerPipelineFunction extends DoFn {
- private transient static final ReentrantLock INIT_MUTEX_LOCK = new ReentrantLock();
+// private transient static final ReentrantLock INIT_MUTEX_LOCK = new ReentrantLock();
private final String resourceFolder;
private final String textField;
@@ -168,7 +168,6 @@ public MedTaggerPipelineFunction(String textField, String resourceFolder, RunMod
@Setup
public void init() throws IOException, InvalidXMLException, URISyntaxException, ResourceInitializationException {
try {
- INIT_MUTEX_LOCK.lock();
AggregateBuilder ae = new AggregateBuilder();
// Tokenization, Sentence Splitting, Section Detection, etc.
if (this.secTag.equalsIgnoreCase("DEFAULT")) {
@@ -288,7 +287,6 @@ public void init() throws IOException, InvalidXMLException, URISyntaxException,
this.cas = CasCreationUtils.createCas(Collections.singletonList(aae.getMetaData()),
null, resMgr);
} finally {
- INIT_MUTEX_LOCK.unlock();
}
}
diff --git a/src/main/resources/desc/backbone/aes/PreConceptExtractionAE.xml b/src/main/resources/desc/backbone/aes/PreConceptExtractionAE.xml
index 40b4356..5dce935 100644
--- a/src/main/resources/desc/backbone/aes/PreConceptExtractionAE.xml
+++ b/src/main/resources/desc/backbone/aes/PreConceptExtractionAE.xml
@@ -37,7 +37,6 @@
SentenceDetectorAE
TokenizerAE
- ChunkerAE
POSTaggerAE
LineSentenceDetectorAE
LvgLookupAE