From 56fef85263ee328318fd92a51a1ea3cc248531be Mon Sep 17 00:00:00 2001
From: "Thomas L. Redman"
Date: Sat, 1 Sep 2018 19:00:03 -0500
Subject: [PATCH 1/3] Configuration parameters, gazetteers and brown clusters
 are no longer static singletons; rather, they are stored in the annotator
 (which is a singleton per model via the factory for NER annotators). In this
 way, we can have multiple NER models for different languages in the same
 process space.

---
 .../org/cogcomp/md/BIOCombinedReader.java     |   9 +-
 .../main/java/org/cogcomp/md/BIOReader.java   |  10 +-
 .../java/org/cogcomp/md/ExtentReader.java     |  10 +-
 .../java/org/cogcomp/md/ExtentTester.java     |  17 +-
 .../java/org/cogcomp/md/MentionAnnotator.java |   7 +-
 .../ner/ExpressiveFeatures/BrownClusters.java | 209 ++++++++++--------
 .../ContextAggregation.java                   |  10 +-
 .../ExpressiveFeaturesAnnotator.java          |  65 ++----
 .../ExpressiveFeatures/FlatGazetteers.java    |  10 -
 .../ner/ExpressiveFeatures/GazetteerTree.java |  62 +-----
 .../ExpressiveFeatures/GazetteersFactory.java |  34 +--
 .../TitleTextNormalizer.java                  |  14 --
 .../ExpressiveFeatures/TreeGazetteers.java    |  37 +++-
 ...TwoLayerPredictionAggregationFeatures.java |   2 +-
 .../cogcomp/ner/InferenceMethods/Decoder.java |  26 +--
 .../cs/cogcomp/ner/LbjTagger/Data.java        |  16 +-
 .../LbjTagger/LearningCurveMultiDataset.java  |  93 ++++----
 .../ner/LbjTagger/NEDisplayPredictions.java   |  10 +-
 .../cs/cogcomp/ner/LbjTagger/NETagPlain.java  |  81 +++----
 .../ner/LbjTagger/NETesterMultiDataset.java   |  58 ++---
 .../cs/cogcomp/ner/LbjTagger/NEWord.java      |  11 +-
 .../cs/cogcomp/ner/LbjTagger/Parameters.java  |  45 ++--
 .../ner/LbjTagger/ParametersForLbjCode.java   |  20 +-
 .../illinois/cs/cogcomp/ner/ModelLoader.java  |  10 +-
 .../illinois/cs/cogcomp/ner/NERAnnotator.java |  29 +--
 .../illinois/cs/cogcomp/ner/NerBenchmark.java |  50 ++---
 .../illinois/cs/cogcomp/ner/NerTagger.java    |  20 +-
 .../BracketFileReader.java                    |  38 ++--
 .../BuildEvaluationFiles.java                 |   6 +-
 .../ColumnFileReader.java                     |  10 +-
 .../PlainTextReader.java                      |  53 +++--
 .../TaggedDataReader.java                     |  18 +-
 ner/src/main/lbj/LbjTagger.lbj                |  68 +++---
 .../cs/cogcomp/ner/NERAnnotatorTest.java      |   3 -
 .../edu/illinois/cs/cogcomp/ner/NerTest.java  |  16 +-
 .../cogcomp/ner/reference/ReferenceUtils.java |   5 +-
 .../java/org/cogcomp/re/ACEMentionReader.java |   4 +-
 .../java/org/cogcomp/re/ExampleUsage.java     |   7 +-
 .../cogcomp/re/PredictedMentionReader.java    |   5 +-
 .../org/cogcomp/re/RelationAnnotator.java     |   5 +-
 .../org/cogcomp/re/SemEvalMentionReader.java  |   6 +-
 41 files changed, 545 insertions(+), 664 deletions(-)

diff --git a/md/src/main/java/org/cogcomp/md/BIOCombinedReader.java b/md/src/main/java/org/cogcomp/md/BIOCombinedReader.java
index 107f4c748..7dfed869c 100644
--- a/md/src/main/java/org/cogcomp/md/BIOCombinedReader.java
+++ b/md/src/main/java/org/cogcomp/md/BIOCombinedReader.java
@@ -7,6 +7,7 @@
  */
 package org.cogcomp.md;
 
+import edu.illinois.cs.cogcomp.core.constants.Language;
 import edu.illinois.cs.cogcomp.core.datastructures.ViewNames;
 import edu.illinois.cs.cogcomp.core.datastructures.textannotation.*;
 import edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator;
@@ -137,10 +138,12 @@ else if (mode.contains("ALL")){
     private List getTokensFromTAs(){
         List ret = new ArrayList<>();
         WordNetManager wordNet = null;
+        Gazetteers gazetteers = null;
+        BrownClusters brownClusters = null;
         try {
             Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig());
             File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
-            GazetteersFactory.init(5, gazetteersResource.getPath() + File.separator + "gazetteers", true);
+            gazetteers = GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English);
             Vector bcs = new Vector<>();
             bcs.add("brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt");
             bcs.add("brown-clusters/brownBllipClusters");
@@ -153,15 +156,13 @@ private List getTokensFromTAs(){
             bcsl.add(false);
             bcsl.add(false);
             bcsl.add(false);
-            BrownClusters.init(bcs, bcst, bcsl, false);
+            brownClusters = BrownClusters.get(bcs, bcst, bcsl);
             WordNetManager.loadConfigAsClasspathResource(true);
             wordNet = WordNetManager.getInstance();
         }
         catch (Exception e){
             e.printStackTrace();
         }
-        Gazetteers gazetteers = GazetteersFactory.get();
-        BrownClusters brownClusters = BrownClusters.get();
         for (TextAnnotation ta : currentTas){
             View tokenView = ta.getView(ViewNames.TOKENS);
             String mentionViewName = "";
diff --git a/md/src/main/java/org/cogcomp/md/BIOReader.java b/md/src/main/java/org/cogcomp/md/BIOReader.java
index 20c56617e..d6a88273b 100644
--- a/md/src/main/java/org/cogcomp/md/BIOReader.java
+++ b/md/src/main/java/org/cogcomp/md/BIOReader.java
@@ -7,6 +7,7 @@
  */
 package org.cogcomp.md;
 
+import edu.illinois.cs.cogcomp.core.constants.Language;
 import edu.illinois.cs.cogcomp.core.datastructures.ViewNames;
 import edu.illinois.cs.cogcomp.core.datastructures.textannotation.*;
 import edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator;
@@ -139,10 +140,12 @@ private void annotateTas(){
     private List getTokensFromTAs(){
         List ret = new ArrayList<>();
         WordNetManager wordNet = null;
+        Gazetteers gazetteers = null;
+        BrownClusters brownClusters = null;
         try {
             Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig());
             File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
-            GazetteersFactory.init(5, gazetteersResource.getPath() + File.separator + "gazetteers", true);
+            gazetteers = GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English);
             Vector bcs = new Vector<>();
             bcs.add("brown-clusters" + File.separator + "brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt");
             bcs.add("brown-clusters" + File.separator + "brownBllipClusters");
@@ -155,15 +158,14 @@ private List getTokensFromTAs(){
             bcsl.add(false);
             bcsl.add(false);
             bcsl.add(false);
-            BrownClusters.init(bcs, bcst, bcsl, false);
+            brownClusters = BrownClusters.get(bcs, bcst, bcsl);
             WordNetManager.loadConfigAsClasspathResource(true);
             wordNet = WordNetManager.getInstance();
         }
         catch (Exception e){
             e.printStackTrace();
         }
-        Gazetteers gazetteers = GazetteersFactory.get();
-        BrownClusters brownClusters = BrownClusters.get();
+
         String mentionViewName = "";
         if (_mode.equals("ACE05")){
             mentionViewName = ViewNames.MENTION_ACE;
diff --git a/md/src/main/java/org/cogcomp/md/ExtentReader.java b/md/src/main/java/org/cogcomp/md/ExtentReader.java
index a1f28b814..2372d9647 100644
--- a/md/src/main/java/org/cogcomp/md/ExtentReader.java
+++ b/md/src/main/java/org/cogcomp/md/ExtentReader.java
@@ -7,6 +7,7 @@
  */
 package org.cogcomp.md;
 
+import edu.illinois.cs.cogcomp.core.constants.Language;
 import edu.illinois.cs.cogcomp.core.datastructures.ViewNames;
 import edu.illinois.cs.cogcomp.core.datastructures.textannotation.*;
 import edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator;
@@ -117,12 +118,14 @@ public List getTextAnnotations(){
     public List getPairs(){
         List ret = new ArrayList<>();
         WordNetManager wordNet = null;
+        Gazetteers gazetteers = null;
+        BrownClusters brownClusters = null;
         try {
             WordNetManager.loadConfigAsClasspathResource(true);
             wordNet = WordNetManager.getInstance();
             Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig());
             File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
-            GazetteersFactory.init(5, gazetteersResource.getPath() + File.separator + "gazetteers", true);
+            gazetteers = GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English);
             Vector bcs = new Vector<>();
             bcs.add("brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt");
             bcs.add("brown-clusters/brownBllipClusters");
@@ -135,14 +138,11 @@ public List getPairs(){
             bcsl.add(false);
             bcsl.add(false);
             bcsl.add(false);
-            BrownClusters.init(bcs, bcst, bcsl, false);
-
+            brownClusters = BrownClusters.get(bcs, bcst, bcsl);
         }
         catch (Exception e){
             e.printStackTrace();
         }
-        Gazetteers gazetteers = GazetteersFactory.get();
-        BrownClusters brownClusters = BrownClusters.get();
         for (TextAnnotation ta : taList){
             String mentionViewName = ViewNames.MENTION_ERE;
             if (ta.getId().startsWith("bn") || ta.getId().startsWith("nw")){
diff --git a/md/src/main/java/org/cogcomp/md/ExtentTester.java b/md/src/main/java/org/cogcomp/md/ExtentTester.java
index b98e20296..9d93fef76 100644
--- a/md/src/main/java/org/cogcomp/md/ExtentTester.java
+++ b/md/src/main/java/org/cogcomp/md/ExtentTester.java
@@ -9,6 +9,7 @@
 import org.cogcomp.md.LbjGen.*;
 
+import edu.illinois.cs.cogcomp.core.constants.Language;
 import edu.illinois.cs.cogcomp.core.datastructures.Pair;
 import edu.illinois.cs.cogcomp.core.datastructures.ViewNames;
 import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent;
@@ -208,13 +209,15 @@ public static void testExtentOnGoldHead(){
         int correct = 0;
         POSAnnotator posAnnotator = null;
         WordNetManager wordNet = null;
+        Gazetteers gazetteers = null;
+        BrownClusters brownClusters = null;
         try{
             WordNetManager.loadConfigAsClasspathResource(true);
             wordNet = WordNetManager.getInstance();
             posAnnotator = new POSAnnotator();
             Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig());
             File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
-            GazetteersFactory.init(5, gazetteersResource.getPath() + File.separator + "gazetteers", true);
+            gazetteers = GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English);
             Vector bcs = new Vector<>();
             bcs.add("brown-clusters" + File.separator + "brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt");
             bcs.add("brown-clusters" + File.separator + "brownBllipClusters");
@@ -227,13 +230,11 @@ public static void testExtentOnGoldHead(){
             bcsl.add(false);
             bcsl.add(false);
             bcsl.add(false);
-            BrownClusters.init(bcs, bcst, bcsl, false);
+            brownClusters = BrownClusters.get(bcs, bcst, bcsl);
         }
         catch (Exception e){
             e.printStackTrace();
         }
-        Gazetteers gazetteers = GazetteersFactory.get();
-        BrownClusters brownClusters = BrownClusters.get();
         for (int i = 0; i < 1; i++) {
             ExtentReader train_parser = new ExtentReader("data/partition_with_dev/train/" + i, "COMBINED-ALL-TRAIN-" + i);
             extent_classifier classifier = train_extent_classifier(train_parser);
@@ -288,12 +289,14 @@ public static Constituent getPredictedMentionHead(Constituent c){
 
     public static void testExtentOnPredictedHead(){
         WordNetManager wordNet = null;
+        Gazetteers gazetteers = null;
+        BrownClusters brownClusters = null;
         try{
             WordNetManager.loadConfigAsClasspathResource(true);
             wordNet = WordNetManager.getInstance();
             Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig());
             File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
-            GazetteersFactory.init(5, gazetteersResource.getPath() + File.separator + "gazetteers", true);
+            gazetteers = GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English);
             Vector bcs = new Vector<>();
             bcs.add("brown-clusters" + File.separator + "brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt");
             bcs.add("brown-clusters" + File.separator + "brownBllipClusters");
@@ -306,13 +309,11 @@ public static void testExtentOnPredictedHead(){
             bcsl.add(false);
             bcsl.add(false);
             bcsl.add(false);
-            BrownClusters.init(bcs, bcst, bcsl, false);
+            brownClusters = BrownClusters.get(bcs, bcst, bcsl);
         }
         catch (Exception e){
             e.printStackTrace();
         }
-        Gazetteers gazetteers = GazetteersFactory.get();
-        BrownClusters brownClusters = BrownClusters.get();
 
         int total_mention_predicted = 0;
         int total_mention_labeled = 0;
diff --git a/md/src/main/java/org/cogcomp/md/MentionAnnotator.java b/md/src/main/java/org/cogcomp/md/MentionAnnotator.java
index eaadd196e..2e661ed45 100644
--- a/md/src/main/java/org/cogcomp/md/MentionAnnotator.java
+++ b/md/src/main/java/org/cogcomp/md/MentionAnnotator.java
@@ -9,6 +9,7 @@
 
 import edu.illinois.cs.cogcomp.annotation.Annotator;
 import edu.illinois.cs.cogcomp.annotation.AnnotatorException;
+import edu.illinois.cs.cogcomp.core.constants.Language;
 import edu.illinois.cs.cogcomp.core.datastructures.Pair;
 import edu.illinois.cs.cogcomp.core.datastructures.ViewNames;
 import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent;
@@ -167,8 +168,7 @@ else if (_mode.contains("ERE")){
         try {
             Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig());
             File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
-            GazetteersFactory.init(5, gazetteersResource.getPath() + File.separator + "gazetteers", true);
-            gazetteers = (FlatGazetteers) GazetteersFactory.get();
+            gazetteers = (FlatGazetteers) GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English);
             Vector bcs = new Vector<>();
             bcs.add("brown-clusters" + File.separator + "brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt");
             bcs.add("brown-clusters" + File.separator + "brownBllipClusters");
@@ -181,8 +181,7 @@
             bcsl.add(false);
             bcsl.add(false);
             bcsl.add(false);
-            BrownClusters.init(bcs, bcst, bcsl, false);
-            brownClusters = BrownClusters.get();
+            brownClusters = BrownClusters.get(bcs, bcst, bcsl);
             WordNetManager.loadConfigAsClasspathResource(true);
             wordNet = WordNetManager.getInstance();
         }
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java
index ecb57eec8..257f85d9d 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java
@@ -26,6 +26,7 @@
 import java.io.FileNotFoundException;
 import java.io.InputStream;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.StringTokenizer;
 import java.util.Vector;
@@ -33,95 +34,141 @@
 public class BrownClusters {
     private static Logger logger = LoggerFactory.getLogger(BrownClusters.class);
 
-    /** the sole instance of this class. */
-    private static BrownClusters brownclusters = null;
-
     /** used to synchronize initialization. */
     static private final String INIT_SYNC = "Brown Cluster Initialization Synchronization Token";
-
-    /**
-     * This method should never be called before init, or the gazetteer will not be initialized.
-     *
-     * @return the singleton instance of the Gazetteers class.
-     */
-    static public BrownClusters get() {
-        synchronized (INIT_SYNC) {
-            return brownclusters;
-        }
-    }
-
-    static public void set(BrownClusters bc){
-        brownclusters = bc;
-    }
 
     /** ensures singleton-ness. */
     private BrownClusters() {
-
     }
 
     private boolean[] isLowercaseBrownClustersByResource = null;
     private ArrayList resources = null;
    private ArrayList> wordToPathByResource = null;
-    private final int[] prefixLengths = {4, 6, 10, 20};
+    private final int[] prefixLengths = { 4, 6, 10, 20 };
+
+    /** clusters store, keyed on catenated paths. */
+    static private HashMap clusters = new HashMap<>();
+
+    /**
+     * Makes a unique key based on the paths, for storage in a hashmap.
+     * @param pathsToClusterFiles the paths.
+     * @return the key.
+     */
+    private static String getKey(Vector pathsToClusterFiles) {
+        ArrayList paths = new ArrayList<>();
+        for (String path : pathsToClusterFiles) {
+            paths.add(path);
+        }
+        Collections.sort(paths);
+        StringBuffer sb = new StringBuffer();
+        for (String path : paths) {
+            sb.append(path);
+            sb.append(" ");
+        }
+        return sb.toString();
+    }
 
     /**
-     * Initialze the brown cluster data. This is a singleton, so this process is sychronized and
-     * atomic with resprect to the get() method above.
+     * Initialize the brown cluster data. Clusters are stored in a static data structure to avoid reloading the same (read-only)
+     * clusters over and over.
      * @param pathsToClusterFiles the files containing the data.
     * @param thresholds
     * @param isLowercaseBrownClusters
     */
-    public static void init(Vector pathsToClusterFiles, Vector thresholds,
-            Vector isLowercaseBrownClusters, boolean useLocalBrownCluster) {
-
-        try {
-            Datastore dsNoCredentials = new Datastore(new ResourceConfigurator().getDefaultConfig());
-            File bcDirectory = dsNoCredentials.getDirectory("org.cogcomp.brown-clusters", "brown-clusters", 1.5, false);
-
+    public static BrownClusters get(Vector pathsToClusterFiles, Vector thresholds, Vector isLowercaseBrownClusters) {
+        boolean useLocalBrownCluster = true;
+        String key = null;
         synchronized (INIT_SYNC) {
-            brownclusters = new BrownClusters();
-            brownclusters.isLowercaseBrownClustersByResource =
-                    new boolean[isLowercaseBrownClusters.size()];
-            brownclusters.wordToPathByResource = new ArrayList<>();
-            brownclusters.resources = new ArrayList<>();
-            for (int i = 0; i < pathsToClusterFiles.size(); i++) {
-                THashMap h = new THashMap<>();
-                // We used to access the files as resources. Now we are accessing them programmatically.
-                // InFile in = new InFile(ResourceUtilities.loadResource(pathsToClusterFiles.elementAt(i)));
-                // Here we check if local resource is specified.
-                String bcFilePath = bcDirectory.getPath() + File.separator + pathsToClusterFiles.elementAt(i);
-                if (useLocalBrownCluster){
-                    bcFilePath = pathsToClusterFiles.elementAt(i);
-                }
-                InputStream is = new FileInputStream(bcFilePath);
-                InFile in = new InFile(is);
-                String line = in.readLine();
-                int wordsAdded = 0;
-                while (line != null) {
-                    StringTokenizer st = new StringTokenizer(line);
-                    String path = st.nextToken();
-                    String word = st.nextToken();
-                    int occ = Integer.parseInt(st.nextToken());
-                    if (occ >= thresholds.elementAt(i)) {
-                        h.put(word, path);
-                        wordsAdded++;
+            // first check for a cluster already loaded for this data.
+            key = getKey(pathsToClusterFiles);
+            if (!clusters.containsKey(key)) {
+
+                // check to see if all the paths exist on the local file system.
+                for (String path : pathsToClusterFiles) {
+                    if (!new File(path).exists()) {
+                        useLocalBrownCluster = false;
+                        break;
                     }
-                    line = in.readLine();
                 }
-                if (ParametersForLbjCode.currentParameters.debug) {
-                    logger.info(wordsAdded + " words added");
+
+                // create the cluster data structure.
+                BrownClusters brownclusters = new BrownClusters();
+                brownclusters.isLowercaseBrownClustersByResource = new boolean[isLowercaseBrownClusters.size()];
+                brownclusters.wordToPathByResource = new ArrayList<>();
+                brownclusters.resources = new ArrayList<>();
+                if (!useLocalBrownCluster) {
+
+                    // load everything from Minio
+                    try {
+                        Datastore dsNoCredentials = new Datastore(new ResourceConfigurator().getDefaultConfig());
+                        File bcDirectory = dsNoCredentials.getDirectory("org.cogcomp.brown-clusters", "brown-clusters", 1.5, false);
+                        for (int i = 0; i < pathsToClusterFiles.size(); i++) {
+                            THashMap h = new THashMap<>();
+
+                            // Here we check if local resource is specified.
+                            String bcFilePath = bcDirectory.getPath() + File.separator + pathsToClusterFiles.elementAt(i);
+                            InputStream is = new FileInputStream(bcFilePath);
+                            InFile in = new InFile(is);
+                            String line = in.readLine();
+                            while (line != null) {
+                                StringTokenizer st = new StringTokenizer(line);
+                                String path = st.nextToken();
+                                String word = st.nextToken();
+                                int occ = Integer.parseInt(st.nextToken());
+                                if (occ >= thresholds.elementAt(i)) {
+                                    h.put(word, path);
+                                }
+                                line = in.readLine();
+                            }
+
+                            brownclusters.wordToPathByResource.add(h);
+                            brownclusters.isLowercaseBrownClustersByResource[i] = isLowercaseBrownClusters.elementAt(i);
+                            brownclusters.resources.add(pathsToClusterFiles.elementAt(i));
+                            in.close();
+                        }
+                        logger.info("Loaded brown cluster from "+key+" from Minio system.");
+                        clusters.put(key, brownclusters);
+                    } catch (InvalidPortException | InvalidEndpointException | DatastoreException
+                            | FileNotFoundException e) {
+                        throw new RuntimeException("Brown Clusters could not be loaded.", e);
+                    }
+                } else {
+
+                    // load the clusters from the local file system.
+                    try {
+                        for (int i = 0; i < pathsToClusterFiles.size(); i++) {
+                            THashMap h = new THashMap<>();
+
+                            // Here we check if local resource is specified.
+                            String bcFilePath = pathsToClusterFiles.elementAt(i);
+                            InputStream is;
+                            is = new FileInputStream(bcFilePath);
+                            InFile in = new InFile(is);
+                            String line = in.readLine();
+                            while (line != null) {
+                                StringTokenizer st = new StringTokenizer(line);
+                                String path = st.nextToken();
+                                String word = st.nextToken();
+                                int occ = Integer.parseInt(st.nextToken());
+                                if (occ >= thresholds.elementAt(i)) {
+                                    h.put(word, path);
+                                }
+                                line = in.readLine();
+                            }
+                            brownclusters.wordToPathByResource.add(h);
+                            brownclusters.isLowercaseBrownClustersByResource[i] = isLowercaseBrownClusters.elementAt(i);
+                            brownclusters.resources.add(pathsToClusterFiles.elementAt(i));
+                            in.close();
+                        }
+                        logger.info("Loaded brown cluster from "+key+" from the local file system.");
+                        clusters.put(key, brownclusters);
+                    } catch (FileNotFoundException e) {
+                        throw new RuntimeException("Brown Clusters files existed on local disk, but could not be loaded.", e);
+                    }
+                }
             }
         }
-                brownclusters.wordToPathByResource.add(h);
-                brownclusters.isLowercaseBrownClustersByResource[i] =
-                        isLowercaseBrownClusters.elementAt(i);
-                brownclusters.resources.add(pathsToClusterFiles.elementAt(i));
-                in.close();
-            }
-        }
-        } catch (InvalidPortException | InvalidEndpointException | DatastoreException | FileNotFoundException e) {
-            e.printStackTrace();
-        }
+        return clusters.get(key);
     }
 
     /**
@@ -156,10 +203,10 @@ final public String[] getPrefixes(String word) {
         return res;
     }
 
-    final public String getPrefixesCombined(String word){
+    final public String getPrefixesCombined(String word) {
         String[] cl = getPrefixes(word);
         String ret = "";
-        for (String s : cl){
+        for (String s : cl) {
             ret += s + ",";
         }
         return ret;
@@ -184,12 +231,6 @@ final public void printOovData(Data data) {
             tokensHash.put(form, true);
             tokensHashIC.put(form.toLowerCase(), true);
         }
-        /*
-         * System.out.println("Data statistics:");
-         * System.out.println("\t\t- Total tokens with repetitions ="+ totalTokens);
-         * System.out.println("\t\t- Total unique tokens ="+ tokensHash.size());
-         * System.out.println("\t\t- Total unique tokens ignore case ="+ tokensHashIC.size());
-         */
         for (THashMap wordToPath : wordToPathByResource) {
             HashMap oovCaseSensitiveHash = new HashMap<>();
             HashMap oovAfterLowercasingHash = new HashMap<>();
@@ -199,8 +240,7 @@ final public void printOovData(Data data) {
                 if (!wordToPath.containsKey(form)) {
                     oovCaseSensitiveHash.put(form, true);
                 }
-                if ((!wordToPath.containsKey(form))
-                        && (!wordToPath.containsKey(form.toLowerCase()))) {
+                if ((!wordToPath.containsKey(form)) && (!wordToPath.containsKey(form.toLowerCase()))) {
                    oovAfterLowercasingHash.put(form.toLowerCase(), true);
                 }
             }
@@ -208,21 +248,4 @@ final public void printOovData(Data data) {
             }
         }
     }
-
-    public static void main(String[] args) {
-        /*
-         * Vector resources=new Vector<>();
-         * resources.addElement("Data/BrownHierarchicalWordClusters/brownBllipClusters");
-         * Vector thres=new Vector<>(); thres.addElement(5); Vector lowercase=new
-         * Vector<>(); lowercase.addElement(false); init(resources,thres,lowercase);
-         * logger.info("finance "); printArr(getPrefixes(new NEWord(new
-         * Word("finance"),null,null))); logger.info("help"); printArr(getPrefixes(new
-         * NEWord(new Word("help"),null,null))); logger.info("resque ");
-         * printArr(getPrefixes(new NEWord(new Word("resque"),null,null)));
-         * logger.info("assist "); printArr(getPrefixes(new NEWord(new
-         * Word("assist"),null,null))); logger.info("assistance "); printArr(getPrefixes(new
-         * NEWord(new Word("assistance"),null,null))); logger.info("guidance ");
-         * printArr(getPrefixes(new NEWord(new Word("guidance"),null,null)));
-         */
-    }
 }
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/ContextAggregation.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/ContextAggregation.java
index d107b20ba..5110929a2 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/ContextAggregation.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/ContextAggregation.java
@@ -20,8 +20,8 @@ public class ContextAggregation {
      * that the data was annotated with dictionaries etc.
      */
     public static void annotate(NEWord word) {
-        if (ParametersForLbjCode.currentParameters.featuresToUse.containsKey("aggregateContext")
-                || ParametersForLbjCode.currentParameters.featuresToUse
+        if (word.params.featuresToUse.containsKey("aggregateContext")
+                || word.params.featuresToUse
                         .containsKey("aggregateGazetteerMatches")) {
             int i = 0;
             NEWord w = word, last = word.nextIgnoreSentenceBoundary;
@@ -57,7 +57,7 @@ public static void annotate(NEWord word) {
                     updateFeatureCounts(word, "appearsDownCased");
                 if (w.form.equalsIgnoreCase(word.form) && Character.isUpperCase(w.form.charAt(0))
                         && Character.isUpperCase(word.form.charAt(0)) && word != w) {
-                    if (ParametersForLbjCode.currentParameters.featuresToUse
+                    if (word.params.featuresToUse
                             .containsKey("aggregateContext")) {
                         if (w.previous == null)
                             updateFeatureCounts(word, "appearancesUpperStartSentence");
@@ -75,8 +75,8 @@ public static void annotate(NEWord word) {
                         wtemp = wtemp.previousIgnoreSentenceBoundary;
                     do {
                         updateFeatureCounts(word, "context:" + j + ":" + wtemp.form);
-                        if (BrownClusters.get().getResources() != null) {
-                            String[] brownPaths = BrownClusters.get().getPrefixes(wtemp);
+                        if (word.params.brownClusters.getResources() != null) {
+                            String[] brownPaths = word.params.brownClusters.getPrefixes(wtemp);
                             // for(int k=0;k 0)
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/ExpressiveFeaturesAnnotator.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/ExpressiveFeaturesAnnotator.java
index bbd0ec1e2..1150d9e16 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/ExpressiveFeaturesAnnotator.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/ExpressiveFeaturesAnnotator.java
@@ -18,67 +18,34 @@
 import java.util.Vector;
 
 public class ExpressiveFeaturesAnnotator {
-    /**
-     * Do not worry about the brown clusters and word embeddings, this stuff is added on the fly in
-     * the .lbj feature generators...
-     */
-    public static void oldannotate(Data data, Gazetteers gaz) throws Exception {
-        // annotating with Gazetteers;
-        if (ParametersForLbjCode.currentParameters.featuresToUse != null) {
-            if (ParametersForLbjCode.currentParameters.featuresToUse
-                    .containsKey("GazetteersFeatures")) {
-
-                // first make sure the gazetteers arrays are inited for each word.
-                for (int docid = 0; docid < data.documents.size(); docid++) {
-                    ArrayList sentences = data.documents.get(docid).sentences;
-                    for (LinkedVector sentence : sentences) {
-                        for (int j = 0; j < sentence.size(); j++) {
-                            NEWord ww = (NEWord) sentence.get(j);
-                            if (ww.gazetteers == null)
-                                ww.gazetteers = new ArrayList<>();
-                        }
-                    }
-                }
-
-                // Annotating the data with gazetteers
-                for (int docid = 0; docid < data.documents.size(); docid++) {
-                    ArrayList sentences = data.documents.get(docid).sentences;
-                    for (LinkedVector sentence : sentences) {
-                        for (int j = 0; j < sentence.size(); j++)
-                            gaz.annotate((NEWord) sentence.get(j));
-                    }
-                }
-            }
-        }
-    }
 
     /**
      * Do not worry about the brown clusters and word embeddings, this stuff is added on the fly in
      * the .lbj feature generators...
     */
-    public static void annotate(Data data) throws Exception {
+    public static void annotate(Data data, ParametersForLbjCode params) throws Exception {
         // logger.info("Annotating the data with expressive features...");
 
         /*
         * must be after the linkability has been initialized!!!
         */
-        if (ParametersForLbjCode.currentParameters.normalizeTitleText) {
+        if (params.normalizeTitleText) {
             // logger.info("Normalizing text case ...");
             TitleTextNormalizer.normalizeCase(data);
         }
-        if (ParametersForLbjCode.currentParameters.featuresToUse.containsKey("BrownClusterPaths")) {
+        if (params.featuresToUse.containsKey("BrownClusterPaths")) {
             // logger.info("Brown clusters OOV statistics:");
-            BrownClusters.get().printOovData(data);
+            params.brownClusters.printOovData(data);
         }
-        if (ParametersForLbjCode.currentParameters.featuresToUse.containsKey("WordEmbeddings")) {
+        if (params.featuresToUse.containsKey("WordEmbeddings")) {
             // logger.info("Word Embeddings OOV statistics:");
             WordEmbeddings.printOovData(data);
         }
         // annotating with Gazetteers;
-        if (ParametersForLbjCode.currentParameters.featuresToUse != null) {
-            if (ParametersForLbjCode.currentParameters.featuresToUse
+        if (params.featuresToUse != null) {
+            if (params.featuresToUse
                     .containsKey("GazetteersFeatures")) {
                 // first make sure the gazetteers arrays are inited for each word.
                 for (int docid = 0; docid < data.documents.size(); docid++) {
@@ -92,7 +59,7 @@ public static void annotate(Data data) throws Exception {
                     }
                 }
 
-                Gazetteers gaz = GazetteersFactory.get();
+                Gazetteers gaz = params.gazetteers;
                 for (int docid = 0; docid < data.documents.size(); docid++) {
                     ArrayList sentences = data.documents.get(docid).sentences;
                     for (LinkedVector vector : sentences) {
@@ -123,17 +90,14 @@ public static void annotate(Data data) throws Exception {
 
         /*
         * Note that this piece of code must be the last!!! Here we are adding as features the
         * predictions of the aux models
+         *
+         * @redman I changed this considerably; since the properties are no longer static, this bit is easier
         */
-        for (int i = 0; i < ParametersForLbjCode.currentParameters.auxiliaryModels.size(); i++) {
-            ParametersForLbjCode currentModel = ParametersForLbjCode.currentParameters;
-            ParametersForLbjCode.currentParameters =
-                    ParametersForLbjCode.currentParameters.auxiliaryModels.elementAt(i);
-            Decoder.annotateDataBIO(data,
-                    (NETaggerLevel1) ParametersForLbjCode.currentParameters.taggerLevel1,
-                    (NETaggerLevel2) ParametersForLbjCode.currentParameters.taggerLevel2);
+        for (int i = 0; i < params.auxiliaryModels.size(); i++) {
+            Decoder.annotateDataBIO(data, params.auxiliaryModels.elementAt(i));
             Vector v = new Vector<>();
             v.addElement(data);
-            NETesterMultiDataset.printAllTestResultsAsOneDataset(v, false);
+            NETesterMultiDataset.printAllTestResultsAsOneDataset(v, false, params);
             TextChunkRepresentationManager.changeChunkRepresentation(
                     TextChunkRepresentationManager.EncodingScheme.BIO,
                     TextChunkRepresentationManager.EncodingScheme.BILOU, data,
                     NEWord.LabelToLookAt.PredictionLevel1Tagger);
             TextChunkRepresentationManager.changeChunkRepresentation(
                     TextChunkRepresentationManager.EncodingScheme.BIO,
                     TextChunkRepresentationManager.EncodingScheme.BILOU, data,
                     NEWord.LabelToLookAt.PredictionLevel2Tagger);
-            // addAuxiliaryClassifierFeatures(data, "aux_model_" + i);
-
-            ParametersForLbjCode.currentParameters = currentModel;
         }
     }
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/FlatGazetteers.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/FlatGazetteers.java
index 20acc760c..9ac0ef165 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/FlatGazetteers.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/FlatGazetteers.java
@@ -55,9 +55,6 @@ public FlatGazetteers(String pathToDictionaries) throws IOException {
         dictionaries = null;
         dictionariesIgnoreCase = null;
         dictionariesOneWordIgnorePunctuation = null;
-        if (ParametersForLbjCode.currentParameters.debug) {
-            logger.info("Loading gazetteers from path '" + pathToDictionaries + "'....");
-        }
         ArrayList filenames = new ArrayList<>();
         // List the Gazetteers directory (either local or in the classpath)
         // XXX Needed to add the dir listing file since there is no easy way to read inside the
@@ -78,10 +75,6 @@ public FlatGazetteers(String pathToDictionaries) throws IOException {
 
         for (int i = 0; i < filenames.size(); i++) {
             String file = filenames.get(i);
-
-            if (ParametersForLbjCode.currentParameters.debug) {
-                logger.info("\tloading gazetteer:...." + file);
-            }
             dictionaries.add(new THashSet());
             dictionariesIgnoreCase.add(new THashSet());
             dictionariesOneWordIgnorePunctuation.add(new THashSet());
@@ -116,9 +109,6 @@ public FlatGazetteers(String pathToDictionaries) throws IOException {
                 in.close();
             }
         }
-        if (ParametersForLbjCode.currentParameters.debug) {
-            logger.info("found " + dictionaries.size() + " gazetteers");
-        }
     }
 
     public void annotate(NEWord w) {
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/GazetteerTree.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/GazetteerTree.java
index 71a59f847..3a7a81078 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/GazetteerTree.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/GazetteerTree.java
@@ -7,17 +7,15 @@
  */
 package edu.illinois.cs.cogcomp.ner.ExpressiveFeatures;
 
-import edu.illinois.cs.cogcomp.core.constants.Language;
-import edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord;
-import edu.illinois.cs.cogcomp.ner.LbjTagger.ParametersForLbjCode;
-import gnu.trove.map.hash.THashMap;
-
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.util.ArrayList;
 
+import edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord;
+import gnu.trove.map.hash.THashMap;
+
 /**
  * this is a class optimized to match an expression that may be a word or a phrase. It is a THashSet
  * on the outside, but every match entry is an object that may indicate a match, or may require
@@ -34,26 +32,7 @@ public class GazetteerTree {
     final private int maxPhraseLength;
 
     /** String splitter, by default split on 1 or more white space. */
-    private StringSplitterInterface splitter = new StringSplitterInterface() {
-        @Override
-        public String[] split(String line) {
-
-            // character tokenization for Chinese
-            if(ParametersForLbjCode.currentParameters.language == Language.Chinese) {
-                String[] chars = new String[line.length()];
-                for(int i = 0; i < line.length(); i++)
-                    chars[i] = String.valueOf(line.charAt(i));
-                return chars;
-            }
-            else
-                return line.split("[\\s]+");
-        }
-
-        @Override
-        final public String normalize(String term) {
-            return term;
-        }
-    };
+    private StringSplitterInterface splitter = null;
 
     /**
      * instances of this interface can be passed in to control how strings are split. If not
@@ -99,6 +78,8 @@ class GazEntry {
      * or make the entry with match not true, and children. The first term passed in is the key,
     * so we won't store it here, but if there are subsequent terms, we need to construct a tree
     * path.
+     * @param entries the entries for this node.
+     * @param n dictionary names.
     */
     GazEntry(String[] entries, DictionaryNames n) {
         if (entries.length == 1) {
@@ -121,6 +102,9 @@ class GazEntry {
     /**
      * Term from a phrase, beyond the first term. If this is the last term, it indicates a
     * match, otherwise it does not, and the children must be constructed.
+     * @param entries the entries
+     * @param i the location in the list.
+     * @param n dict names.
     */
     GazEntry(String[] entries, int i, DictionaryNames n) {
         if (entries.length == i) {
@@ -287,15 +271,6 @@ public void trimToSize() {
         }
     }
 
-    /**
-     * Initialize the tree, data will be read via a separate call.
-     *
-     * @param phrase_length the max number of terms to match per phrase.
-     */
-    GazetteerTree(int phrase_length) {
-        this.maxPhraseLength = phrase_length;
-    }
-
     /**
      * Initialize the tree, data will be read via a separate call.
      *
      * @param phrase_length the max number of terms to match per phrase.
      * @param splitr this interface implementation will determin how we split strings.
      */
     GazetteerTree(int phrase_length, StringSplitterInterface splitr) {
-        this(phrase_length);
+        this.maxPhraseLength = phrase_length;
         this.splitter = splitr;
     }
 
@@ -323,23 +298,6 @@ public void trimToSize() {
         this.readDictionary(filename, suffix, res);
     }
 
-    /**
-     * Read the file, constructing an entry for each line in the file, for each line containing
-     * multiple words create a path farther down in the tree, with a match only at the end. If any
-     * entry already exists for the name, ensure the entry class is the correct one, add entries
-     * farther down if necessary.
-     *
-     * @param filename the directory containing the gazetteers
-     * @param phrase_length the max number of terms to match per phrase.
-     * @throws IOException
-     */
-    GazetteerTree(int phrase_length, String filename, String suffix, InputStream res)
-            throws IOException {
-        this(phrase_length);
-        this.readDictionary(filename, suffix, res);
-    }
-
-
     /**
      * read the given dictionary file from the input stream.
     *
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/GazetteersFactory.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/GazetteersFactory.java
index 117640706..173a96d4f 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/GazetteersFactory.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/GazetteersFactory.java
@@ -11,8 +11,7 @@
 import java.util.HashMap;
 import java.util.Map;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import edu.illinois.cs.cogcomp.core.constants.Language;
 
 /**
  * This singleton class contains all the gazetteer data and dictionaries. Can only be accessed via
@@ -22,12 +21,6 @@
  */
 public class GazetteersFactory {
 
-    /** the logger. */
-    static private Logger logger = LoggerFactory.getLogger(GazetteersFactory.class);
-
-    /** the sole instance of this class. */
-    private static Gazetteers gazetteers = null;
-
     /** this is a token whose only use it to ensure thread safety. */
     private static String GAZ_INIT_LOCK = "GAZ_INIT_LOCK";
 
@@ -37,11 +30,14 @@ public class GazetteersFactory {
     /**
      * Initialize the gazetteers. This method requires some exception handling, so the
     * initialization is separated from the fetching.
-     *
+     * @param maxPhraseLength the max number of tokens to keep for phrases.
     * @param path path to the gaz files.
+     * @param flatgazetteers if true, create a flat gaz, less memory, but slower.
+     * @param language the language.
+     * @return the gazetteer, newly created if we don't already have it.
     * @throws IOException
     */
-    static public void init(int maxPhraseLength, String path, boolean flatgazetteers)
+    static public Gazetteers get(int maxPhraseLength, String path, boolean flatgazetteers, Language language)
             throws IOException {
         synchronized (GAZ_INIT_LOCK) {
 
@@ -51,24 +47,10 @@ static public void init(int maxPhraseLength, String path, boolean flatgazetteers
                 }
             } else {
                 if (!gazetteers_map.containsKey(path) || gazetteers_map.get(path) instanceof FlatGazetteers) {
-                    gazetteers_map.put(path, new TreeGazetteers(maxPhraseLength, path));
+                    gazetteers_map.put(path, new TreeGazetteers(maxPhraseLength, path, language));
                 }
             }
-
-            gazetteers = gazetteers_map.get(path);
+            return gazetteers_map.get(path);
         }
     }
-
-    /**
-     * This method should never be called before init, or the gazetteer will not be initialized.
-     *
-     * @return the singleton instance of the Gazetteers class.
-     */
-    static public Gazetteers get() {
-        return gazetteers;
-    }
-
-    static public void set(Gazetteers gaz){
-        gazetteers = gaz;
-    }
 }
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/TitleTextNormalizer.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/TitleTextNormalizer.java
index 59504e0c7..35acdc977 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/TitleTextNormalizer.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/TitleTextNormalizer.java
@@ -37,8 +37,6 @@ public class TitleTextNormalizer {
     public static HashMap lowercasedToNormalizedTokensMap = null;
 
     public static void init() {
-        if (!ParametersForLbjCode.currentParameters.normalizeTitleText)
-            return;
         InFile in = new InFile(pathToBrownClusterForWordFrequencies);
         String line = in.readLine();
         lowercasedToNormalizedTokensMap = new HashMap<>();
@@ -64,19 +62,7 @@ public static void init() {
 
     }
 
-    public static void normalizeCaseData(Vector data) {
-        if (!ParametersForLbjCode.currentParameters.normalizeTitleText)
-            return;
-        if (lowercasedToNormalizedTokensMap == null)
-            init();
-        for (int did = 0; did < data.size(); did++)
-            normalizeCase(data.elementAt(did));
-    }
-
     public static void normalizeCase(Data data) {
-        if (!ParametersForLbjCode.currentParameters.normalizeTitleText)
-            return;
         if (lowercasedToNormalizedTokensMap == null)
             init();
         // Below are the words that we'll want to normalize. We'll fill in the hashtable below with
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/TreeGazetteers.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/TreeGazetteers.java
index 75e4e31d9..ca8c3a80f 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/TreeGazetteers.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/TreeGazetteers.java
@@ -14,7 +14,6 @@
 import edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.GazetteerTree.StringSplitterInterface;
 import edu.illinois.cs.cogcomp.ner.IO.ResourceUtilities;
 import edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord;
-import edu.illinois.cs.cogcomp.ner.LbjTagger.ParametersForLbjCode;
 import io.minio.errors.InvalidEndpointException;
 import io.minio.errors.InvalidPortException;
 import org.slf4j.Logger;
@@ -47,8 +46,8 @@ public class TreeGazetteers implements Gazetteers {
      * @param pathToDictionaries the path to the gazetteers.
     * @throws IOException
     */
-    TreeGazetteers(int phrase_length, String pathToDictionaries) throws IOException {
-        init(phrase_length, pathToDictionaries);
+    TreeGazetteers(int phrase_length, String pathToDictionaries, Language language) throws IOException {
+        init(phrase_length, pathToDictionaries, language);
     }
 
     /**
@@ -58,7 +57,7 @@ public class TreeGazetteers implements Gazetteers {
      * @param phrase_length the max length of the phrases we will consider.
     * @throws IOException
     */
-    private void init(int phrase_length, String pathToDictionaries) throws IOException {
+    private void init(int phrase_length, String pathToDictionaries, final Language language) throws IOException {
         try {
 
             // check the local file system for it.
@@ -66,11 +65,13 @@ private void init(int phrase_length, String pathToDictionaries) throws IOExcepti
             String pathToLists = gazDirectory.getPath() + File.separator + "gazetteers" + File.separator + "gazetteers-list.txt";
             InputStream stream = ResourceUtilities.loadResource(pathToLists);
             if (stream == null) {
-
+                logger.info("Loading gazetteers from \""+pathToLists+"\" using the Minio cache.");
                 // not in file system or classpath, try Minio.
                 Datastore dsNoCredentials = new Datastore(new ResourceConfigurator().getDefaultConfig());
                 gazDirectory = dsNoCredentials.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.6, false);
                 stream = new FileInputStream(gazDirectory.getPath() + File.separator + "gazetteers" + File.separator + "gazetteers-list.txt");
+            } else {
+                logger.info("Loading gazetteers from \""+pathToLists+"\" from the local file system.");
             }
             BufferedReader br = new BufferedReader(new InputStreamReader(stream));
             String line;
@@ -81,7 +82,25 @@ private void init(int phrase_length, String pathToDictionaries) throws IOExcepti
             // init the dictionaries.
             dictionaries = new ArrayList<>(filenames.size());
             dictionariesIgnoreCase = new ArrayList<>(filenames.size());
-            GazetteerTree gaz = new GazetteerTree(phrase_length);
+            GazetteerTree gaz = new GazetteerTree(phrase_length, new StringSplitterInterface() {
+                @Override
+                public String[] split(String line) {
+
+                    // character tokenization for Chinese
+                    if(language == Language.Chinese) {
+                        String[] chars = new String[line.length()];
+                        for(int i = 0; i < line.length(); i++)
+                            chars[i] = String.valueOf(line.charAt(i));
+                        return chars;
+                    } else
+                        return line.split("[\\s]+");
+                }
+
+                @Override
+                final public String normalize(String term) {
+                    return term;
+                }
+            });
             GazetteerTree gazIC = new GazetteerTree(phrase_length, new StringSplitterInterface() {
                 @Override
                 public String[] split(String line) {
@@ -91,7 +110,7 @@ public String[] split(String line) {
                         return new String[0];
                     else {
                         // character tokenization for Chinese
-                        if (ParametersForLbjCode.currentParameters.language == Language.Chinese) {
+                        if (language == Language.Chinese) {
                            String[] chars = new String[line.length()];
                            for (int i = 0; i < line.length(); i++)
                                chars[i] = String.valueOf(line.charAt(i));
@@ -117,9 +136,7 @@ public String normalize(String term) {
             gazIC.trimToSize();
             dictionaries.add(gaz);
             dictionariesIgnoreCase.add(gazIC);
-            if (ParametersForLbjCode.currentParameters.debug) {
-                logger.info("found " + dictionaries.size() + " gazetteers");
-            }
+            logger.info("Gazetteers from \""+pathToLists+"\" are loaded.");
         } catch (InvalidPortException | InvalidEndpointException e) {
             e.printStackTrace();
         } catch (DatastoreException e) {
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/TwoLayerPredictionAggregationFeatures.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/TwoLayerPredictionAggregationFeatures.java
index fcd89a32c..7ea96412d 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/TwoLayerPredictionAggregationFeatures.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/TwoLayerPredictionAggregationFeatures.java
@@ -50,7 +50,7 @@ public static void setLevel1AggregationFeatures(Data data, boolean useGoldData)
      * going to use the predictions as features
     */
     private static void setLevel1AggregationFeatures(NEWord word, boolean useGoldData) {
-        ParametersForLbjCode parameters = ParametersForLbjCode.currentParameters;
+        ParametersForLbjCode parameters = word.params;
         // this used to be hard-coded to 0.1
         double omissionRate = parameters.omissionRate;
         // this used to be hard-coded to 0.2 for right direction and 0.1 for left
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/InferenceMethods/Decoder.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/InferenceMethods/Decoder.java
index 979ab7c55..9b5fce2b9 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/InferenceMethods/Decoder.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/InferenceMethods/Decoder.java
@@ -26,50 +26,46 @@ public class Decoder {
     /**
      * If you don't wanna use some of the classifiers - pass null parameters.
     */
-    public static void annotateDataBIO(Data data, NETaggerLevel1 taggerLevel1,
-            NETaggerLevel2 taggerLevel2) throws Exception {
-        Decoder.annotateBIO_AllLevelsWithTaggers(data, taggerLevel1, taggerLevel2);
+    public static void annotateDataBIO(Data data, ParametersForLbjCode params) throws Exception {
+        Decoder.annotateBIO_AllLevelsWithTaggers(data, params);
     }
 
     /**
      * use taggerLevel2=null if you want to use only one level of inference
     */
-    protected static void annotateBIO_AllLevelsWithTaggers(Data data, NETaggerLevel1 taggerLevel1,
-            NETaggerLevel2 taggerLevel2) throws Exception {
+    protected static void annotateBIO_AllLevelsWithTaggers(Data data, ParametersForLbjCode params) throws Exception {
         clearPredictions(data);
         NETaggerLevel1.isTraining = false;
         NETaggerLevel2.isTraining = false;
 
-        GreedyDecoding.annotateGreedy(data, taggerLevel1, 1);
+        GreedyDecoding.annotateGreedy(data, params.taggerLevel1, 1);
         TextChunkRepresentationManager.changeChunkRepresentation(
-                ParametersForLbjCode.currentParameters.taggingEncodingScheme,
+                params.taggingEncodingScheme,
                 TextChunkRepresentationManager.EncodingScheme.BIO, data,
                 NEWord.LabelToLookAt.PredictionLevel1Tagger);
 
         PredictionsAndEntitiesConfidenceScores.pruneLowConfidencePredictions(data,
-                ParametersForLbjCode.currentParameters.minConfidencePredictionsLevel1,
+                params.minConfidencePredictionsLevel1,
                 NEWord.LabelToLookAt.PredictionLevel1Tagger);
 
         // this block runs the level2 tagger
        // Previously checked if features included 'PatternFeatures'
-        boolean level2 =
-                ParametersForLbjCode.currentParameters.featuresToUse
-                        .containsKey("PredictionsLevel1");
-        if (taggerLevel2 != null && level2) {
+        boolean level2 = params.featuresToUse.containsKey("PredictionsLevel1");
+        if (params.taggerLevel2 != null && level2) {
             // annotate with patterns
             PredictionsAndEntitiesConfidenceScores.pruneLowConfidencePredictions(data, 0.0,
                     NEWord.LabelToLookAt.PredictionLevel1Tagger);
             TwoLayerPredictionAggregationFeatures.setLevel1AggregationFeatures(data, false);
-            GreedyDecoding.annotateGreedy(data, taggerLevel2, 2);
+            GreedyDecoding.annotateGreedy(data, params.taggerLevel2, 2);
             PredictionsAndEntitiesConfidenceScores.pruneLowConfidencePredictions(data,
-                    ParametersForLbjCode.currentParameters.minConfidencePredictionsLevel2,
+                    params.minConfidencePredictionsLevel2,
                    NEWord.LabelToLookAt.PredictionLevel2Tagger);
             TextChunkRepresentationManager.changeChunkRepresentation(
-                    ParametersForLbjCode.currentParameters.taggingEncodingScheme,
+                    params.taggingEncodingScheme,
                    TextChunkRepresentationManager.EncodingScheme.BIO, data,
                    NEWord.LabelToLookAt.PredictionLevel2Tagger);
         } else {
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/Data.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/Data.java
index 8f44bb7d0..a3330a3ee 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/Data.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/Data.java
@@ -25,7 +25,6 @@ public class Data {
     HashSet labelsToAnonymizeForEvaluation = new HashSet<>();
 
     private Data(Data other) {
-        // no copy!!!
     }
@@ -42,38 +41,37 @@ public Data(NERDocument doc) {
 
     public Data(String pathToData, String nickname, String dataFormat,
             Vector labelsToIgnoreForEvaluation,
-            Vector labelsToAnonymizeForEvaluation) throws Exception {
+            Vector labelsToAnonymizeForEvaluation, ParametersForLbjCode params) throws Exception {
         this.datasetPath = pathToData;
         this.nickname = nickname;
         this.pathToData = pathToData;
         if ((new File(pathToData)).isDirectory()) {
-            Vector docs = TaggedDataReader.readFolder(pathToData, dataFormat);
+            Vector docs = TaggedDataReader.readFolder(pathToData, dataFormat, params);
             for (int i = 0; i < docs.size(); i++)
                 documents.add(docs.elementAt(i));
         } else {
             int idx = Math.max(Math.max(0, pathToData.lastIndexOf("/")), pathToData.lastIndexOf('\\'));
             String docname = pathToData.substring(idx);
-            documents.add(TaggedDataReader.readFile(pathToData, dataFormat, docname));
+            documents.add(TaggedDataReader.readFile(pathToData, dataFormat, docname, params));
         }
         setLabelsToIgnore(labelsToIgnoreForEvaluation);
         setLabelsToAnonymize(labelsToAnonymizeForEvaluation);
     }
 
-    public Data(String pathToData, String nickname, String dataFormat,
-            String[] labelsToIgnoreForEvaluation, String[] labelsToAnonymizeForEvaluation)
-            throws Exception {
+    public Data(String pathToData, String nickname, String dataFormat, String[] labelsToIgnoreForEvaluation,
+            String[] labelsToAnonymizeForEvaluation, ParametersForLbjCode params) throws Exception {
         this.datasetPath = pathToData;
         this.nickname = nickname;
         if ((new File(pathToData)).isDirectory()) {
-            Vector docs = TaggedDataReader.readFolder(pathToData, dataFormat);
+            Vector docs = TaggedDataReader.readFolder(pathToData, dataFormat, params);
             for (int i = 0; i < docs.size(); i++)
                 documents.add(docs.elementAt(i));
         } else {
             int idx = Math.max(Math.max(0, pathToData.lastIndexOf("/")), pathToData.lastIndexOf('\\'));
             String docname = pathToData.substring(idx);
-            documents.add(TaggedDataReader.readFile(pathToData, dataFormat, docname));
+            documents.add(TaggedDataReader.readFile(pathToData, dataFormat, docname, params));
         }
         setLabelsToIgnore(labelsToIgnoreForEvaluation);
         setLabelsToAnonymize(labelsToAnonymizeForEvaluation);
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/LearningCurveMultiDataset.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/LearningCurveMultiDataset.java
index b8dee1ccc..eda10d002 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/LearningCurveMultiDataset.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/LearningCurveMultiDataset.java
@@ -46,13 +46,13 @@ public class LearningCurveMultiDataset {
      * @param devDataPath data used to auto-converge.
      */
     public static void buildFinalModel(int fixedNumIterations, String trainDataPath,
-            String testDataPath, String devDataPath, boolean incremental) throws Exception {
-        Data trainData = new Data(trainDataPath, trainDataPath, "-c", new String[] {}, new String[] {});
-        ExpressiveFeaturesAnnotator.annotate(trainData);
-        Data testData = new Data(testDataPath, testDataPath, "-c", new String[] {}, new String[] {});
-        ExpressiveFeaturesAnnotator.annotate(testData);
-        Data devData = new Data(devDataPath, devDataPath, "-c", new String[] {}, new String[] {});
-        ExpressiveFeaturesAnnotator.annotate(devData);
+            String testDataPath, String devDataPath, boolean incremental, ParametersForLbjCode params) throws Exception {
+        Data trainData = new Data(trainDataPath, trainDataPath, "-c", new String[] {}, new String[] {}, params);
+        ExpressiveFeaturesAnnotator.annotate(trainData, params);
+        Data testData = new Data(testDataPath, testDataPath, "-c", new String[] {}, new String[] {}, params);
+        ExpressiveFeaturesAnnotator.annotate(testData, params);
+        Data devData = new Data(devDataPath, devDataPath, "-c", new String[] {}, new String[] {}, params);
+        ExpressiveFeaturesAnnotator.annotate(devData, params);
         Vector train = new Vector<>();
         train.addElement(trainData);
         train.addElement(testData);
@@ -60,7 +60,7 @@ public static void buildFinalModel(int fixedNumIterations, String trainDataPath,
         test.addElement(devData);
         logger.debug("Building final model: iterations = " + fixedNumIterations + " train = '"
                 + trainDataPath + "' test = '"+testDataPath+"' dev = '" + testDataPath+"'");
-        getLearningCurve(train, test, fixedNumIterations, incremental);
+        getLearningCurve(train, test, fixedNumIterations, incremental, params);
     }
 
     /**
@@ -71,9 +71,9 @@ public static void buildFinalModel(int fixedNumIterations, String trainDataPath,
      * @param incremental if the model is being incremented, this is true.
      * @throws Exception
      */
-    public static void getLearningCurve(int fixedNumIterations, String trainDataPath,
-            String testDataPath, boolean incremental) throws Exception {
-        getLearningCurve(fixedNumIterations, trainDataPath, "-c", testDataPath, incremental);
+    public static void getLearningCurve(int fixedNumIterations, String trainDataPath, String testDataPath,
+            boolean incremental, ParametersForLbjCode params) throws Exception {
+        getLearningCurve(fixedNumIterations, "-c", trainDataPath, testDataPath, incremental, params);
     }
 
     /**
@@ -92,20 +92,20 @@ public static void getLearningCurve(int fixedNumIterations, String trainDataPath
      * @throws Exception
     */
     public static void getLearningCurve(int fixedNumIterations, String dataFormat, String trainDataPath,
-            String testDataPath, boolean incremental) throws Exception {
+            String testDataPath, boolean incremental, ParametersForLbjCode params) throws Exception {
         logger.debug("getLearningCurve(): fni = " + fixedNumIterations + " trainDataPath = '"
                 + trainDataPath + "' testDataPath = '" + testDataPath + "'....");
         Data trainData =
-                new Data(trainDataPath, trainDataPath, dataFormat, new String[] {}, new String[] {});
-        ExpressiveFeaturesAnnotator.annotate(trainData);
+                new Data(trainDataPath, trainDataPath, dataFormat, new String[] {}, new String[] {}, params);
+        ExpressiveFeaturesAnnotator.annotate(trainData, params);
         Data testData =
-                new Data(testDataPath, testDataPath, dataFormat, new String[] {}, new String[] {});
-        ExpressiveFeaturesAnnotator.annotate(testData);
+                new Data(testDataPath, testDataPath, dataFormat, new String[] {}, new String[] {}, params);
+        ExpressiveFeaturesAnnotator.annotate(testData, params);
         Vector train = new Vector<>();
         train.addElement(trainData);
         Vector test = new Vector<>();
         test.addElement(testData);
-        getLearningCurve(train, test, fixedNumIterations, incremental);
+        getLearningCurve(train, test, fixedNumIterations, incremental, params);
     }
 
     /**
@@ -120,11 +120,11 @@ public static void getLearningCurve(int fixedNumIterations, String dataFormat, S
      * @throws Exception
     */
     public static void getLearningCurve(Vector trainDataSet, Vector testDataSet,
-            int fixedNumIterations, boolean incremental) throws Exception {
+            int fixedNumIterations, boolean incremental, ParametersForLbjCode params) throws Exception {
         double bestF1Level1 = -1;
         int bestRoundLevel1 = 0;
         // Get the directory name (.model is appended in LbjTagger/Parameters.java:139)
-        String modelPath = ParametersForLbjCode.currentParameters.pathToModelFile;
+        String modelPath = params.pathToModelFile;
         String modelPathDir = modelPath.substring(0, modelPath.lastIndexOf("/"));
         if (IOUtils.exists(modelPathDir)) {
            if (!IOUtils.isDirectory(modelPathDir)) {
@@ -142,11 +142,11 @@ public static void getLearningCurve(Vector trainDataSet, Vector test
 
         NETaggerLevel1.Parameters paramLevel1 = new NETaggerLevel1.Parameters();
         paramLevel1.baseLTU =
                 new SparseAveragedPerceptron(
-                        ParametersForLbjCode.currentParameters.learningRatePredictionsLevel1, 0,
-                        ParametersForLbjCode.currentParameters.thicknessPredictionsLevel1);
-        paramLevel1.baseLTU.featurePruningThreshold = ParametersForLbjCode.currentParameters.featurePruningThreshold;
-        logger.info("Level 1 classifier learning rate = "+ParametersForLbjCode.currentParameters.learningRatePredictionsLevel1+
-                ", thickness = "+ParametersForLbjCode.currentParameters.thicknessPredictionsLevel1);
+                        params.learningRatePredictionsLevel1, 0,
+                        params.thicknessPredictionsLevel1);
+        paramLevel1.baseLTU.featurePruningThreshold = params.featurePruningThreshold;
+        logger.info("Level 1 classifier learning rate = "+params.learningRatePredictionsLevel1+
+                ", thickness = "+params.thicknessPredictionsLevel1);
 
         NETaggerLevel1 tagger1 =
                 new NETaggerLevel1(paramLevel1, modelPath + ".level1", modelPath + ".level1.lex");
@@ -156,11 +156,10 @@ public static void getLearningCurve(Vector trainDataSet, Vector test
         } else {
             logger.info("Training L1 model incrementally.");
         }
-        ParametersForLbjCode.currentParameters.taggerLevel1 = tagger1;
+        params.taggerLevel1 = tagger1;
         for (int dataId = 0; dataId < trainDataSet.size(); dataId++) {
             Data trainData = trainDataSet.elementAt(dataId);
-            if (ParametersForLbjCode.currentParameters.featuresToUse
-                    .containsKey("PredictionsLevel1")) {
+            if (params.featuresToUse.containsKey("PredictionsLevel1")) {
                 PredictionsAndEntitiesConfidenceScores.getAndMarkEntities(trainData,
                         NEWord.LabelToLookAt.GoldLabel);
                 TwoLayerPredictionAggregationFeatures.setLevel1AggregationFeatures(trainData, true);
@@ -168,7 +167,7 @@ public static void getLearningCurve(Vector trainDataSet, Vector test
         }
 
         // preextract the L1 test and train data.
-        String path = ParametersForLbjCode.currentParameters.pathToModelFile;
+        String path = params.pathToModelFile;
         String trainPathL1 = path + ".level1.prefetchedTrainData";
         File deleteme = new File(trainPathL1);
         if (deleteme.exists())
@@ -178,9 +177,9 @@ public static void getLearningCurve(Vector trainDataSet, Vector test
         if (deleteme.exists())
             deleteme.delete();
         logger.info("Pre-extracting the training data for Level 1 classifier, saving to "+trainPathL1);
-        BatchTrainer bt1train = prefetchAndGetBatchTrainer(tagger1, trainDataSet, trainPathL1);
+        BatchTrainer bt1train = prefetchAndGetBatchTrainer(tagger1, trainDataSet, trainPathL1, params);
         logger.info("Pre-extracting the testing data for Level 1 classifier, saving to "+testPathL1);
-        BatchTrainer bt1test = prefetchAndGetBatchTrainer(tagger1, testDataSet, testPathL1);
+        BatchTrainer bt1test = prefetchAndGetBatchTrainer(tagger1, testDataSet, testPathL1, params);
         Parser testParser1 = bt1test.getParser();
 
         // create the best model possible.
@@ -204,7 +203,7 @@ public static void getLearningCurve(Vector trainDataSet, Vector test
             logger.info(i + " rounds.  Best so far for Level1 : (" + bestRoundLevel1 + ")="
                     + bestF1Level1);
         }
-        saveme.getBaseLTU().featurePruningThreshold = ParametersForLbjCode.currentParameters.featurePruningThreshold;
+        saveme.getBaseLTU().featurePruningThreshold = params.featurePruningThreshold;
         saveme.doneTraining();
         saveme.save();
         logger.info("Level 1; best round : " + bestRoundLevel1 + "\tbest F1 : " + bestF1Level1);
@@ -225,12 +224,12 @@ public static void getLearningCurve(Vector trainDataSet, Vector test
 
         NETaggerLevel2.Parameters paramLevel2 = new NETaggerLevel2.Parameters();
         paramLevel2.baseLTU =
                 new SparseAveragedPerceptron(
-                        ParametersForLbjCode.currentParameters.learningRatePredictionsLevel2, 0,
-                        ParametersForLbjCode.currentParameters.thicknessPredictionsLevel2);
-        paramLevel2.baseLTU.featurePruningThreshold = ParametersForLbjCode.currentParameters.featurePruningThreshold;
+                        params.learningRatePredictionsLevel2, 0,
+                        params.thicknessPredictionsLevel2);
+        paramLevel2.baseLTU.featurePruningThreshold = params.featurePruningThreshold;
         NETaggerLevel2 tagger2 =
-                new NETaggerLevel2(paramLevel2, ParametersForLbjCode.currentParameters.pathToModelFile
-                        + ".level2", ParametersForLbjCode.currentParameters.pathToModelFile
+                new NETaggerLevel2(paramLevel2, params.pathToModelFile
+                        + ".level2", params.pathToModelFile
                        + ".level2.lex");
         if (!incremental) {
             logger.info("Training L2 model from scratch.");
@@ -238,20 +237,20 @@ public static void getLearningCurve(Vector trainDataSet, Vector test
         } else {
             logger.info("Training L2 model incrementally.");
         }
-        ParametersForLbjCode.currentParameters.taggerLevel2 = tagger2;
+        params.taggerLevel2 = tagger2;
 
         // Previously checked if PatternFeatures was in featuresToUse.
-        if (ParametersForLbjCode.currentParameters.featuresToUse.containsKey("PredictionsLevel1")) {
-            logger.info("Level 2 classifier learning rate = "+ParametersForLbjCode.currentParameters.learningRatePredictionsLevel2+
-                    ", thickness = "+ParametersForLbjCode.currentParameters.thicknessPredictionsLevel2);
+        if (params.featuresToUse.containsKey("PredictionsLevel1")) {
+            logger.info("Level 2 classifier learning rate = "+params.learningRatePredictionsLevel2+
+                    ", thickness = "+params.thicknessPredictionsLevel2);
             double bestF1Level2 = -1;
             int bestRoundLevel2 = 0;
             logger.info("Pre-extracting the training data for Level 2 classifier, saving to "+trainPathL2);
             BatchTrainer bt2train =
-                    prefetchAndGetBatchTrainer(tagger2, trainDataSet, trainPathL2);
+                    prefetchAndGetBatchTrainer(tagger2, trainDataSet, trainPathL2, params);
             logger.info("Pre-extracting the testing data for Level 2 classifier, saving to "+testPathL2);
             BatchTrainer bt2test =
-                    prefetchAndGetBatchTrainer(tagger2, testDataSet, testPathL2);
+                    prefetchAndGetBatchTrainer(tagger2, testDataSet, testPathL2, params);
             Parser testParser2 = bt2test.getParser();
 
             // create the best model possible.
@@ -277,7 +276,7 @@ public static void getLearningCurve(Vector trainDataSet, Vector test
                logger.info(i + " rounds. 
Best so far for Level2 : (" + bestRoundLevel2 + ") " + bestF1Level2); } - saveme.getBaseLTU().featurePruningThreshold = ParametersForLbjCode.currentParameters.featurePruningThreshold; + saveme.getBaseLTU().featurePruningThreshold = params.featurePruningThreshold; saveme.doneTraining(); saveme.save(); } @@ -294,7 +293,7 @@ public static void getLearningCurve(Vector trainDataSet, Vector test + "\t Level2: bestround=" + bestRoundLevel2 + "\t F1=" + bestF1Level2); } - NETesterMultiDataset.printTestResultsByDataset(testDataSet, tagger1, tagger2, true); + NETesterMultiDataset.printTestResultsByDataset(testDataSet, tagger1, tagger2, true, params); /* * This will override the models forcing to save the iteration we're interested in- the @@ -313,12 +312,12 @@ public static void getLearningCurve(Vector trainDataSet, Vector test * samples- use 100 partitions otherwise, the zip doesn't work on training files larger than 4G */ private static BatchTrainer prefetchAndGetBatchTrainer(SparseNetworkLearner classifier, - Vector dataSets, String exampleStorePath) { + Vector dataSets, String exampleStorePath, ParametersForLbjCode params) { for (int dataId = 0; dataId < dataSets.size(); dataId++) { Data data = dataSets.elementAt(dataId); TextChunkRepresentationManager.changeChunkRepresentation( TextChunkRepresentationManager.EncodingScheme.BIO, - ParametersForLbjCode.currentParameters.taggingEncodingScheme, data, + params.taggingEncodingScheme, data, NEWord.LabelToLookAt.GoldLabel); } BatchTrainer bt = new BatchTrainer(classifier, new SampleReader(dataSets), 0); @@ -329,7 +328,7 @@ private static BatchTrainer prefetchAndGetBatchTrainer(SparseNetworkLearner clas for (int dataId = 0; dataId < dataSets.size(); dataId++) { Data trainData = dataSets.elementAt(dataId); TextChunkRepresentationManager.changeChunkRepresentation( - ParametersForLbjCode.currentParameters.taggingEncodingScheme, + params.taggingEncodingScheme, TextChunkRepresentationManager.EncodingScheme.BIO, trainData, NEWord.LabelToLookAt.GoldLabel); } diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEDisplayPredictions.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEDisplayPredictions.java index 9e3d4e2e5..3c9818939 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEDisplayPredictions.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEDisplayPredictions.java @@ -30,17 +30,15 @@ public class NEDisplayPredictions { * @param verbose report more. 
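Note the pattern that recurs through the rest of this patch: the Data constructor, the feature annotator, and the decoder all receive the same params, and Decoder.annotateDataBIO resolves both tagger levels from it (level 2 may legally be null). A sketch, with a hypothetical test path:

    Data testData = new Data("data/test", "data/test", "-c",
        new String[] {}, new String[] {}, params);
    ExpressiveFeaturesAnnotator.annotate(testData, params);
    Decoder.annotateDataBIO(testData, params); // taggers come from params, not statics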
* @throws Exception */ - public static void test(String testDatapath, String dataFormat, boolean verbose) + public static void test(String testDatapath, String dataFormat, boolean verbose, ParametersForLbjCode params) throws Exception { Data testData = - new Data(testDatapath, testDatapath, dataFormat, new String[] {}, new String[] {}); - ExpressiveFeaturesAnnotator.annotate(testData); + new Data(testDatapath, testDatapath, dataFormat, new String[] {}, new String[] {}, params); + ExpressiveFeaturesAnnotator.annotate(testData, params); Vector data = new Vector<>(); data.addElement(testData); - NETaggerLevel1 t1 = (NETaggerLevel1) ParametersForLbjCode.currentParameters.taggerLevel1; - NETaggerLevel2 t2 = (NETaggerLevel2) ParametersForLbjCode.currentParameters.taggerLevel2; for (int i = 0; i < data.size(); i++) - Decoder.annotateDataBIO(data.elementAt(i), t1, t2); + Decoder.annotateDataBIO(data.elementAt(i), params); reportPredictions(data.get(0)); } diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NETagPlain.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NETagPlain.java index 948fb2088..544d0817a 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NETagPlain.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NETagPlain.java @@ -7,7 +7,13 @@ */ package edu.illinois.cs.cogcomp.ner.LbjTagger; -import edu.illinois.cs.cogcomp.core.datastructures.ViewNames; +import java.io.File; +import java.util.ArrayList; +import java.util.Vector; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector; import edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.ExpressiveFeaturesAnnotator; import edu.illinois.cs.cogcomp.ner.IO.OutFile; @@ -15,31 +21,12 @@ import edu.illinois.cs.cogcomp.ner.LbjFeatures.NETaggerLevel1; import edu.illinois.cs.cogcomp.ner.LbjFeatures.NETaggerLevel2; import edu.illinois.cs.cogcomp.ner.ParsingProcessingData.PlainTextReader; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.File; -import java.util.ArrayList; -import java.util.Date; -import java.util.Vector; public class NETagPlain { private static final String NAME = NETagPlain.class.getCanonicalName(); - private static NETaggerLevel1 tagger1 = null; - private static NETaggerLevel2 tagger2 = null; - private static Logger logger = LoggerFactory.getLogger(NETagPlain.class); - /** - * assumes ParametersForLbjCode has been initialized - */ - public static void init() { - String modelFile = ParametersForLbjCode.currentParameters.pathToModelFile; - tagger1 = (NETaggerLevel1) ParametersForLbjCode.currentParameters.taggerLevel1; - tagger2 = (NETaggerLevel2) ParametersForLbjCode.currentParameters.taggerLevel2; - } - /** * Tag the plain-text files under the input path, writing bracket-annotated output; the taggers are taken from the passed-in parameters.
* @@ -47,7 +34,7 @@ public static void init() { * @param inputPath * @param outputPath * @throws Exception */ - public static void tagData(String inputPath, String outputPath) throws Exception { + public static void tagData(String inputPath, String outputPath, ParametersForLbjCode params) throws Exception { File f = new File(inputPath); Vector inFiles = new Vector<>(); Vector outFiles = new Vector<>(); @@ -65,13 +52,13 @@ public static void tagData(String inputPath, String outputPath) throws Exception for (int fileId = 0; fileId < inFiles.size(); fileId++) { logger.debug("Tagging file: " + inFiles.elementAt(fileId)); ArrayList sentences = - PlainTextReader.parsePlainTextFile(inFiles.elementAt(fileId)); + PlainTextReader.parsePlainTextFile(inFiles.elementAt(fileId), params); NERDocument doc = new NERDocument(sentences, "consoleInput"); Data data = new Data(doc); - ExpressiveFeaturesAnnotator.annotate(data); + ExpressiveFeaturesAnnotator.annotate(data, params); // the models are no longer loaded here; they arrive preloaded on the params (see ModelLoader). - String tagged = tagData(data, tagger1, tagger2); + String tagged = tagData(data, params); OutFile out = new OutFile(outFiles.elementAt(fileId)); out.println(tagged); out.close(); @@ -97,63 +84,57 @@ public static void tagData(String inputPath, String outputPath) throws Exception * ) * @throws Exception */ - public static String tagLine(String line) throws Exception { + public static String tagLine(String line, ParametersForLbjCode params) throws Exception { logger.debug(NAME + ".tagLine(): tagging input '" + line + "'..."); - ArrayList sentences = PlainTextReader.parseText(line); + ArrayList sentences = PlainTextReader.parseText(line, params); // NOTICE: this only checks tagger1 because tagger2 may legally be null. - if (tagger1 == null) { + if (params.taggerLevel1 == null) { logger.error("Tagger1 is null. Load the models into the parameters (see ModelLoader) before tagging."); return ""; } - return tagSentenceVector(sentences, tagger1, tagger2); + return tagSentenceVector(sentences, params); } - public static AnnotatedDocument getAnnotatedDocument(String input) throws Exception { - ArrayList sentences = PlainTextReader.parseText(input); + public static AnnotatedDocument getAnnotatedDocument(String input, ParametersForLbjCode params) throws Exception { + ArrayList sentences = PlainTextReader.parseText(input, params); NERDocument doc = new NERDocument(sentences, "consoleInput"); Data data = new Data(doc); - ExpressiveFeaturesAnnotator.annotate(data); + ExpressiveFeaturesAnnotator.annotate(data, params); // NOTICE: this only checks tagger1 because tagger2 may legally be null. - if (tagger1 == null) { + if (params.taggerLevel1 == null) { logger.error("Tagger1 is null. Load the models into the parameters (see ModelLoader) before tagging."); return null; } - Decoder.annotateDataBIO(data, tagger1, tagger2); + Decoder.annotateDataBIO(data, params); return new AnnotatedDocument(data); } - public static String tagLine(String line, NETaggerLevel1 tagger1, NETaggerLevel2 tagger2) + public static String tagLine(String line, NETaggerLevel1 tagger1, NETaggerLevel2 tagger2, ParametersForLbjCode params) throws Exception { - ArrayList sentences = PlainTextReader.parseText(line); - return tagSentenceVector(sentences, tagger1, tagger2); + ArrayList sentences = PlainTextReader.parseText(line, params); + return tagSentenceVector(sentences, params); } - public static String tagTextFromFile(String line, NETaggerLevel1 tagger1, NETaggerLevel2 tagger2) + public static String tagTextFromFile(String line, ParametersForLbjCode params) throws Exception { - ArrayList sentences = PlainTextReader.parsePlainTextFile(line); - return tagSentenceVector(sentences, tagger1, tagger2); + ArrayList sentences = PlainTextReader.parsePlainTextFile(line, params); + return tagSentenceVector(sentences, params); } - public static String tagSentenceVector(ArrayList sentences, - NETaggerLevel1 tagger1, NETaggerLevel2 tagger2) throws Exception { + public static String tagSentenceVector(ArrayList sentences, ParametersForLbjCode params) throws Exception { NERDocument doc = new NERDocument(sentences, "consoleInput"); Data data = new Data(doc); - return tagData(data, tagger1, tagger2); + return tagData(data, params); } - public static String tagData(Data data) throws Exception { - return tagData(data, tagger1, tagger2); - } - - public static String tagData(Data data, NETaggerLevel1 tagger1, NETaggerLevel2 tagger2) - throws Exception { - ExpressiveFeaturesAnnotator.annotate(data); - Decoder.annotateDataBIO(data, tagger1, tagger2); + public static String tagData(Data data, ParametersForLbjCode params) throws Exception { + ExpressiveFeaturesAnnotator.annotate(data, params); + Decoder.annotateDataBIO(data, params); StringBuffer res = new StringBuffer(); for (int docid = 0; docid < data.documents.size(); docid++) { diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NETesterMultiDataset.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NETesterMultiDataset.java index 6328b9e0e..edf9daf51 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NETesterMultiDataset.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NETesterMultiDataset.java @@ -30,10 +30,10 @@ public class NETesterMultiDataset { /** * NB: assuming column format */ - public static void test(String testDatapath, boolean verbose, - Vector labelsToIgnoreInEvaluation, Vector labelsToAnonymizeInEvaluation) + public static void test(String testDatapath, boolean verbose, Vector labelsToIgnoreInEvaluation, + Vector labelsToAnonymizeInEvaluation, ParametersForLbjCode params) throws Exception { - test(testDatapath,verbose, "-c", labelsToIgnoreInEvaluation, labelsToAnonymizeInEvaluation); + test(testDatapath,verbose, "-c", labelsToIgnoreInEvaluation, labelsToAnonymizeInEvaluation, params); } /** @@ -45,12 +45,12 @@ public static void test(String testDatapath, boolean verbose, * @param labelsToAnonymizeInEvaluation * @throws Exception */ - public static void test(String testDatapath, boolean verbose, String dataFormat, - Vector labelsToIgnoreInEvaluation, Vector labelsToAnonymizeInEvaluation) + public static void test(String testDatapath, boolean verbose, String dataFormat, Vector labelsToIgnoreInEvaluation, + Vector
labelsToAnonymizeInEvaluation, ParametersForLbjCode params) throws Exception { Data testData = - new Data(testDatapath, testDatapath, dataFormat, new String[] {}, new String[] {}); - ExpressiveFeaturesAnnotator.annotate(testData); + new Data(testDatapath, testDatapath, dataFormat, new String[] {}, new String[] {}, params); + ExpressiveFeaturesAnnotator.annotate(testData, params); Vector data = new Vector<>(); data.addElement(testData); @@ -58,26 +58,26 @@ public static void test(String testDatapath, boolean verbose, String dataFormat, data.elementAt(0).setLabelsToIgnore(labelsToIgnoreInEvaluation); if (labelsToAnonymizeInEvaluation != null) data.elementAt(0).setLabelsToAnonymize(labelsToAnonymizeInEvaluation); - NETaggerLevel1 taggerLevel1 = (NETaggerLevel1) ParametersForLbjCode.currentParameters.taggerLevel1; - NETaggerLevel2 taggerLevel2 = (NETaggerLevel2) ParametersForLbjCode.currentParameters.taggerLevel2; + NETaggerLevel1 taggerLevel1 = (NETaggerLevel1) params.taggerLevel1; + NETaggerLevel2 taggerLevel2 = (NETaggerLevel2) params.taggerLevel2; SparseAveragedPerceptron sap1 = (SparseAveragedPerceptron)taggerLevel1.getBaseLTU(); System.out.println("L1 SparseAveragedPerceptron learning rate = "+sap1.getLearningRate()+", thickness = "+sap1.getPositiveThickness()); - if (ParametersForLbjCode.currentParameters.featuresToUse.containsKey("PredictionsLevel1")) { + if (params.featuresToUse.containsKey("PredictionsLevel1")) { SparseAveragedPerceptron sap2 = (SparseAveragedPerceptron)taggerLevel2.getBaseLTU(); System.out.println("L2 SparseAveragedPerceptron learning rate = "+sap2.getLearningRate()+", thickness = "+sap2.getPositiveThickness()); } - printTestResultsByDataset(data, taggerLevel1, taggerLevel2, verbose); + printTestResultsByDataset(data, taggerLevel1, taggerLevel2, verbose, params); } /** * NB: assuming column format */ - public static void dumpFeaturesLabeledData(String testDatapath, String outDatapath) + public static void dumpFeaturesLabeledData(String testDatapath, String outDatapath, ParametersForLbjCode params) throws Exception { FeaturesLevel1SharedWithLevel2 features1 = new FeaturesLevel1SharedWithLevel2(); FeaturesLevel2 features2 = new FeaturesLevel2(); - NETaggerLevel1 taggerLevel1 = (NETaggerLevel1) ParametersForLbjCode.currentParameters.taggerLevel1; - NETaggerLevel2 taggerLevel2 = (NETaggerLevel2) ParametersForLbjCode.currentParameters.taggerLevel2; + NETaggerLevel1 taggerLevel1 = (NETaggerLevel1) params.taggerLevel1; + NETaggerLevel2 taggerLevel2 = (NETaggerLevel2) params.taggerLevel2; File f = new File(testDatapath); Vector inFiles = new Vector<>(); Vector outFiles = new Vector<>(); @@ -95,9 +95,9 @@ public static void dumpFeaturesLabeledData(String testDatapath, String outDatapa for (int fileId = 0; fileId < inFiles.size(); fileId++) { Data testData = new Data(inFiles.elementAt(fileId), inFiles.elementAt(fileId), "-c", - new String[] {}, new String[] {}); - ExpressiveFeaturesAnnotator.annotate(testData); - Decoder.annotateDataBIO(testData, taggerLevel1, taggerLevel2); + new String[] {}, new String[] {}, params); + ExpressiveFeaturesAnnotator.annotate(testData, params); + Decoder.annotateDataBIO(testData, params); OutFile out = new OutFile(outFiles.elementAt(fileId)); for (int docid = 0; docid < testData.documents.size(); docid++) { ArrayList sentences = testData.documents.get(docid).sentences; @@ -125,17 +125,17 @@ public static void dumpFeaturesLabeledData(String testDatapath, String outDatapa } public static Vector printTestResultsByDataset(Vector dataCollection, - 
NETaggerLevel1 tagger1, NETaggerLevel2 tagger2, boolean verbose) throws Exception { + NETaggerLevel1 tagger1, NETaggerLevel2 tagger2, boolean verbose, ParametersForLbjCode params) throws Exception { for (int i = 0; i < dataCollection.size(); i++) - Decoder.annotateDataBIO(dataCollection.elementAt(i), tagger1, tagger2); - return printTestResultsByDataset(dataCollection, verbose); + Decoder.annotateDataBIO(dataCollection.elementAt(i), params); + return printTestResultsByDataset(dataCollection, verbose, params); } public static TestDiscrete[] printAllTestResultsAsOneDataset(Vector dataCollection, - NETaggerLevel1 tagger1, NETaggerLevel2 tagger2, boolean verbose) throws Exception { + NETaggerLevel1 tagger1, NETaggerLevel2 tagger2, boolean verbose, ParametersForLbjCode params) throws Exception { for (int i = 0; i < dataCollection.size(); i++) - Decoder.annotateDataBIO(dataCollection.elementAt(i), tagger1, tagger2); - return printAllTestResultsAsOneDataset(dataCollection, verbose); + Decoder.annotateDataBIO(dataCollection.elementAt(i), params); + return printAllTestResultsAsOneDataset(dataCollection, verbose, params); } @@ -143,7 +143,7 @@ public static TestDiscrete[] printAllTestResultsAsOneDataset(Vector dataCo * assumes that the data has been annotated by both levels of taggers */ public static Vector printTestResultsByDataset(Vector dataCollection, - boolean verbose) { + boolean verbose, ParametersForLbjCode params) { Vector res = new Vector<>(); for (int dataSetId = 0; dataSetId < dataCollection.size(); dataSetId++) { TestDiscrete resultsPhraseLevel1 = new TestDiscrete(); @@ -171,7 +171,7 @@ public static Vector printTestResultsByDataset(Vector data System.out.println("****** Performance on dataset " + dataCollection.elementAt(dataSetId).datasetPath + " **********"); System.out.println("------------------------------------------------------------"); - if (ParametersForLbjCode.currentParameters.featuresToUse + if (params.featuresToUse .containsKey("PredictionsLevel1")) { System.out.println("Phrase-level Acc Level2:"); resultsPhraseLevel2.printPerformance(System.out); @@ -194,7 +194,7 @@ public static Vector printTestResultsByDataset(Vector data System.out.println(">>>>>>>>> Phrase-level F1 on the dataset: " + dataCollection.elementAt(dataSetId).datasetPath); System.out.println("\t Level 1: " + resultsPhraseLevel1.getOverallStats()[2]); - if (ParametersForLbjCode.currentParameters.featuresToUse + if (params.featuresToUse .containsKey("PredictionsLevel1")) System.out.println("\t Level 2: " + resultsPhraseLevel2.getOverallStats()[2]); } @@ -206,7 +206,7 @@ public static Vector printTestResultsByDataset(Vector data * assumes that the data has been annotated by both levels of taggers */ public static TestDiscrete[] printAllTestResultsAsOneDataset(Vector dataCollection, - boolean verbose) { + boolean verbose, ParametersForLbjCode params) { TestDiscrete resultsPhraseLevel1 = new TestDiscrete(); resultsPhraseLevel1.addNull("O"); TestDiscrete resultsTokenLevel1 = new TestDiscrete(); @@ -233,7 +233,7 @@ public static TestDiscrete[] printAllTestResultsAsOneDataset(Vector dataCo System.out.println("\t>>> Dataset path : \t" + dataCollection.elementAt(i).datasetPath); System.out.println("------------------------------------------------------------"); if (verbose) { - if (ParametersForLbjCode.currentParameters.featuresToUse + if (params.featuresToUse .containsKey("PredictionsLevel1")) { System.out.println("Phrase-level Acc Level2:"); resultsPhraseLevel2.printPerformance(System.out); @@ -250,7 +250,7 @@ 
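As an aside, the evaluation entry points now follow the same convention as training; a sketch of a typical call, with a hypothetical test path (the label vectors ride along on the params):

    NETesterMultiDataset.test("data/test", true, "-c",
        params.labelsToIgnoreInEvaluation, params.labelsToAnonymizeInEvaluation, params);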
public static TestDiscrete[] printAllTestResultsAsOneDataset(Vector dataCo resultsTokenLevel1.printPerformance(System.out); } else { System.out.println("\t Level 1: " + resultsPhraseLevel1.getOverallStats()[2]); - if (ParametersForLbjCode.currentParameters.featuresToUse + if (params.featuresToUse .containsKey("PredictionsLevel1")) System.out.println("\t Level 2: " + resultsPhraseLevel2.getOverallStats()[2]); } diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEWord.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEWord.java index 0e0f55a5b..5066d7d80 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEWord.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEWord.java @@ -28,7 +28,7 @@ public class NEWord extends Word { public String neTypeLevel1; public String neTypeLevel2; public NamedEntity predictedEntity = null;// if non-null it keeps the named entity the tagger - // annotated this word with + public ParametersForLbjCode params = null; public CharacteristicWords predictionConfidencesLevel1Classifier = null; public CharacteristicWords predictionConfidencesLevel2Classifier = null; public NamedEntity goldEntity = null;// if non-null it keeps the named entity the tagger @@ -89,28 +89,29 @@ public NEWord(Word w, NEWord p, String type) { * @param token the individual token. * @param tag the tag to annotate the word with. */ - public static void addTokenToSentence(LinkedVector sentence, String token, String tag) { + public static void addTokenToSentence(LinkedVector sentence, String token, String tag, ParametersForLbjCode prs) { NEWord word = new NEWord(new Word(token), null, tag); + word.params = prs; addTokenToSentence(sentence, word); } public static void addTokenToSentence(LinkedVector sentence, NEWord word) { Vector v = NEWord.splitWord(word); - if (ParametersForLbjCode.currentParameters.tokenizationScheme + if (word.params.tokenizationScheme .equals(TokenizationScheme.DualTokenizationScheme)) { sentence.add(word); word.parts = new String[v.size()]; for (int j = 0; j < v.size(); j++) word.parts[j] = v.elementAt(j).form; } else { - if (ParametersForLbjCode.currentParameters.tokenizationScheme + if (word.params.tokenizationScheme .equals(TokenizationScheme.LbjTokenizationScheme)) { for (int j = 0; j < v.size(); j++) sentence.add(v.elementAt(j)); } else { System.err .println("Fatal error in BracketFileManager.readAndAnnotate - unrecognized tokenization scheme: " - + ParametersForLbjCode.currentParameters.tokenizationScheme); + + word.params.tokenizationScheme); System.exit(0); } } diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/Parameters.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/Parameters.java index 6c738a64a..531841443 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/Parameters.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/Parameters.java @@ -52,8 +52,8 @@ public class Parameters { * @param rm a populated ResourceManager passed as argument to * {@link #readAndLoadConfig readAndLoadConfig} */ - public static void readConfigAndLoadExternalData(ResourceManager rm) { - ParametersForLbjCode.currentParameters = readAndLoadConfig(rm, false); + public static ParametersForLbjCode readConfigAndLoadExternalData(ResourceManager rm) { + return readAndLoadConfig(rm, false); } @@ -66,9 +66,10 @@ public static void readConfigAndLoadExternalData(ResourceManager rm) { * @param areWeTraining this value determines whether or not this run will involve 
training a * model. If we are training, then we make sure there exists a folder in which to put the * trained model. If not, then we make sure the model exists. + * @return the populated parameters instance * @throws IOException if the ResourceManager doesn't load correctly. */ - public static void readConfigAndLoadExternalData(String configFile, boolean areWeTraining) + public static ParametersForLbjCode readConfigAndLoadExternalData(String configFile, boolean areWeTraining) throws IOException { ResourceManager rm = new ResourceManager(configFile); String modelName = rm.getString("modelName"); @@ -81,19 +82,13 @@ public static void readConfigAndLoadExternalData(String configFile, boolean areW // settings switch (modelName) { case ViewNames.NER_CONLL: - ParametersForLbjCode.currentParameters = - readAndLoadConfig(baseConfigurator.getConfig(nonDefaultProps), + return readAndLoadConfig(baseConfigurator.getConfig(nonDefaultProps), areWeTraining); - break; case ViewNames.NER_ONTONOTES: - ParametersForLbjCode.currentParameters = - readAndLoadConfig(baseConfigurator.getConfig(ontonotesConfigurator + return readAndLoadConfig(baseConfigurator.getConfig(ontonotesConfigurator .getConfig(nonDefaultProps)), areWeTraining); - break; default: - ParametersForLbjCode.currentParameters = - readAndLoadConfig(baseConfigurator.getConfig(rm), areWeTraining); - break; + return readAndLoadConfig(baseConfigurator.getConfig(rm), areWeTraining); } } @@ -140,9 +135,6 @@ public static ParametersForLbjCode readAndLoadConfig(ResourceManager rm, boolean if (rm.containsKey("language")) { Language lang = Language.getLanguageByCode(rm.getString("language")); param.language = lang; - - // becuase it is used in initializing tree gazetteers - ParametersForLbjCode.currentParameters.language = lang; } if (rm.containsKey("labelsToAnonymizeInEvaluation")) { @@ -183,7 +175,7 @@ public static ParametersForLbjCode readAndLoadConfig(ResourceManager rm, boolean // models, never // training aux.nameAsAuxFeature = auxModels[i + 1]; - loadClassifierModels(aux); + loadClassifierModels(aux, param); param.auxiliaryModels.addElement(aux); } } @@ -258,20 +250,19 @@ public static ParametersForLbjCode readAndLoadConfig(ResourceManager rm, boolean param.featuresToUse.put("TitleNormalization", true); param.featuresToUse.put("WordTopicTitleInfo", true); - // Conditional Features section - // GazetteersFeatures + // If enabled, load up the gazetteers.
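The gazetteer switch below now assigns the returned instance rather than priming a static. A sketch of the two shapes the call takes (the values mirror the branches below; true selects the flat gazetteers, false the tree form with a configurable phrase length):

    param.gazetteers = GazetteersFactory.get(5, pathToGazetteersLists, true, param.language);
    // or, tree gazetteers honoring NerBaseConfigurator.PHRASE_LENGTH:
    param.gazetteers = GazetteersFactory.get(maxPhraseLength, pathToGazetteersLists, false, param.language);

Because the factory hands back an instance per model, two models in one process can carry different gazetteers (and languages) without fighting over shared static state.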
if (rm.containsKey("GazetteersFeatures") && rm.getString("GazetteersFeatures").equals("1")) { String pathToGazetteersLists = rm.getString("pathToGazetteersLists"); if (rm.containsKey("FlatGazetteers") && Boolean.parseBoolean(rm.getString("FlatGazetteers"))) { logger.info("Loading FlatGazetteers"); - GazetteersFactory.init(5, pathToGazetteersLists, true); + param.gazetteers = GazetteersFactory.get(5, pathToGazetteersLists, true, param.language); } else { int maxPhraseLength = 5; if (rm.containsKey(NerBaseConfigurator.PHRASE_LENGTH)) maxPhraseLength = rm.getInt(NerBaseConfigurator.PHRASE_LENGTH); - GazetteersFactory.init(maxPhraseLength, pathToGazetteersLists, false); + param.gazetteers = GazetteersFactory.get(maxPhraseLength, pathToGazetteersLists, false, param.language); } } @@ -324,7 +315,7 @@ public static ParametersForLbjCode readAndLoadConfig(ResourceManager rm, boolean } - // BrownClusterPaths feature + // If enabled, load up the brown clusters. String brownDebug = ""; if (rm.containsKey("BrownClusterPaths") && rm.getString("BrownClusterPaths").equals("1")) { @@ -350,8 +341,8 @@ public static ParametersForLbjCode readAndLoadConfig(ResourceManager rm, boolean && rm.getString("UseLocalBrownCluster").equals("true")){ useLocalBrownCluster = true; } - BrownClusters.init(pathsToBrownClusters, minWordAppThresholdsForBrownClusters, - lowercaseBrown, useLocalBrownCluster); + param.brownClusters = BrownClusters.get(pathsToBrownClusters, minWordAppThresholdsForBrownClusters, + lowercaseBrown); // For output later for (int i = 0; i < pathsToBrownClusters.size(); i++) { @@ -391,16 +382,14 @@ public static ParametersForLbjCode readAndLoadConfig(ResourceManager rm, boolean return param; } - - - public static void loadClassifierModels(ParametersForLbjCode config) { - if (ParametersForLbjCode.currentParameters.debug) { + public static void loadClassifierModels(ParametersForLbjCode config, ParametersForLbjCode outer) { + if (outer.debug) { logger.debug("Reading the model at: " + config.pathToModelFile + ".level1"); } config.taggerLevel1 = new NETaggerLevel1(config.pathToModelFile + ".level1", config.pathToModelFile + ".level1.lex"); - if (ParametersForLbjCode.currentParameters.debug) { + if (outer.debug) { logger.debug("Reading the model at: " + config.pathToModelFile + ".level2"); } config.taggerLevel2 = diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/ParametersForLbjCode.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/ParametersForLbjCode.java index c5d8f440d..48dca904a 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/ParametersForLbjCode.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/ParametersForLbjCode.java @@ -9,27 +9,33 @@ import edu.illinois.cs.cogcomp.core.constants.Language; import edu.illinois.cs.cogcomp.lbjava.learn.SparseNetworkLearner; +import edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.BrownClusters; +import edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.Gazetteers; import edu.illinois.cs.cogcomp.ner.config.NerBaseConfigurator; import java.util.HashMap; import java.util.Vector; public class ParametersForLbjCode { - // The only static member in the class - public static ParametersForLbjCode currentParameters = new ParametersForLbjCode(); - // Enums - // TODO: fix or remove DualTokenizationScheme + /** Enums + TODO: fix or remove DualTokenizationScheme */ public enum TokenizationScheme { LbjTokenizationScheme, DualTokenizationScheme } - // Optional / predefined features + /** the gazetteers if we are
using them, or null if not. */ + public Gazetteers gazetteers = null; + + /** the brown clusters, or null if disabled. */ + public BrownClusters brownClusters = null; + + /** Optional / predefined features // This is necessary for brackets file reader - // will be initialized to something like {"PER","ORG","LOC","MISC"}; + // will be initialized to something like {"PER","ORG","LOC","MISC"}; */ public String[] labelTypes = {"PER", "ORG", "LOC", "MISC"}; - // Labels to ignore when evaluating model performance, e.g. "MISC" for the MUC7 dataset. + /** Labels to ignore when evaluating model performance, e.g. "MISC" for the MUC7 dataset. */ public Vector labelsToIgnoreInEvaluation = null; // Labels to evaluate only for having found an NE regardless of the label found. diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ModelLoader.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ModelLoader.java index a968543a1..04cccd0f5 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ModelLoader.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ModelLoader.java @@ -39,11 +39,11 @@ public class ModelLoader { * @param rm the resource manager. * @param training if we are training. * @param viewName the name of the view that identifies the model. + * @param cp the parameters for the calling model. */ - static public void load(ResourceManager rm, String viewName, boolean training) { + static public void load(ResourceManager rm, String viewName, boolean training, ParametersForLbjCode cp) { // the loader built into the model will check the local file system and the jar files in the classpath. - ParametersForLbjCode cp = ParametersForLbjCode.currentParameters; String modelPath = cp.pathToModelFile; java.io.File modelFile = new File(modelPath + ".level1"); NETaggerLevel1 tagger1 = null; @@ -109,15 +109,15 @@ static public void load(ResourceManager rm, String viewName, boolean training) { model = modelDir.getPath() + "/model/OntoNotes.model"; } tagger1 = new NETaggerLevel1(model + ".level1", model + ".level1.lex"); - if (ParametersForLbjCode.currentParameters.featuresToUse.containsKey("PredictionsLevel1")) { + if (cp.featuresToUse.containsKey("PredictionsLevel1")) { tagger2 = new NETaggerLevel2(model + ".level2", model + ".level2.lex"); } } catch (InvalidPortException | DatastoreException | InvalidEndpointException e) { e.printStackTrace(); } } - ParametersForLbjCode.currentParameters.taggerLevel1 = tagger1; - ParametersForLbjCode.currentParameters.taggerLevel2 = tagger2; + cp.taggerLevel1 = tagger1; + cp.taggerLevel2 = tagger2; } /** diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERAnnotator.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERAnnotator.java index 977b98ccc..239c29e05 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERAnnotator.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERAnnotator.java @@ -48,11 +48,10 @@ public class NERAnnotator extends Annotator { /** our specific logger. */ private final Logger logger = LoggerFactory.getLogger(NERAnnotator.class); - /** the level one model. */ - public SparseNetworkLearner taggerLevel1; - - /** the level two model. */ - public SparseNetworkLearner taggerLevel2; + /** params were once static, preventing multi-model runtimes, but now are stored here. Params + * include the models, gazetteers and brown clusters.
+ */ + private ParametersForLbjCode params = null; /** * @param nonDefaultConfigValues a configuration file specifying non-default parameters for the @@ -107,15 +106,12 @@ public void initialize(ResourceManager nerRm) { nerRm = new NerOntonotesConfigurator().getConfig(nerRm); else nerRm = new NerBaseConfigurator().getConfig(nerRm); - ParametersForLbjCode.currentParameters.forceNewSentenceOnLineBreaks = false; - Parameters.readConfigAndLoadExternalData(nerRm); - + this.params = Parameters.readConfigAndLoadExternalData(nerRm); + this.params.forceNewSentenceOnLineBreaks = false; // load the models. synchronized (LOADING_MODELS) { - ModelLoader.load(nerRm, viewName, false); - this.taggerLevel1 = ParametersForLbjCode.currentParameters.taggerLevel1; - this.taggerLevel2 = ParametersForLbjCode.currentParameters.taggerLevel2; - } + ModelLoader.load(nerRm, viewName, false, this.params); + } } /** @@ -136,7 +132,7 @@ public void addView(TextAnnotation ta) { LinkedVector words = new LinkedVector(); for (String w : wtoks) { if (w.length() > 0) { - NEWord.addTokenToSentence(words, w, "unlabeled"); + NEWord.addTokenToSentence(words, w, "unlabeled", this.params); tokenindices[neWordIndex] = tokenIndex; neWordIndex++; } else { @@ -151,9 +147,8 @@ public void addView(TextAnnotation ta) { // Do the annotation. Data data = new Data(new NERDocument(sentences, "input")); try { - ExpressiveFeaturesAnnotator.annotate(data); - Decoder.annotateDataBIO(data, (NETaggerLevel1) taggerLevel1, - (NETaggerLevel2) taggerLevel2); + ExpressiveFeaturesAnnotator.annotate(data, this.params); + Decoder.annotateDataBIO(data, params); } catch (Exception e) { logger.error("Cannot annotate the text, the exception was: ", e); return; @@ -241,7 +236,7 @@ public Set getTagValues() { if (!isInitialized()) { doInitialize(); } - Lexicon labelLexicon = taggerLevel1.getLabelLexicon(); + Lexicon labelLexicon = this.params.taggerLevel1.getLabelLexicon(); Set tagSet = new HashSet(); for (int i =0; i < labelLexicon.size(); ++i) { tagSet.add(labelLexicon.lookupKey(i).getStringValue()); diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NerBenchmark.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NerBenchmark.java index fe16d315f..bf254f0db 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NerBenchmark.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NerBenchmark.java @@ -190,74 +190,70 @@ public boolean accept(File dir, String name) { }); for (String confFile : configfiles) { confFile = dir + "/config/" + confFile; + ParametersForLbjCode prms = null; if (!skiptraining) { if (trainDir.exists() && testDir.exists() && devDir.exists()) { System.out.println("\n\n----- Training models for evaluation for "+confFile+" ------"); - Parameters.readConfigAndLoadExternalData(confFile, true); + prms = Parameters.readConfigAndLoadExternalData(confFile, true); ResourceManager rm = new ResourceManager(confFile); - ModelLoader.load(rm, rm.getString("modelName"), true); - NETaggerLevel1 taggerLevel1 = (NETaggerLevel1) ParametersForLbjCode.currentParameters.taggerLevel1; - NETaggerLevel2 taggerLevel2 = (NETaggerLevel2) ParametersForLbjCode.currentParameters.taggerLevel2; + ModelLoader.load(rm, rm.getString("modelName"), true, prms); + NETaggerLevel1 taggerLevel1 = (NETaggerLevel1) prms.taggerLevel1; + NETaggerLevel2 taggerLevel2 = (NETaggerLevel2) prms.taggerLevel2; SparseAveragedPerceptron sap1 = (SparseAveragedPerceptron)taggerLevel1.getBaseLTU(); - 
sap1.setLearningRate(ParametersForLbjCode.currentParameters.learningRatePredictionsLevel1); - sap1.setThickness(ParametersForLbjCode.currentParameters.thicknessPredictionsLevel1); + sap1.setLearningRate(prms.learningRatePredictionsLevel1); + sap1.setThickness(prms.thicknessPredictionsLevel1); System.out.println("L1 learning rate = "+sap1.getLearningRate()+", thickness = "+sap1.getPositiveThickness()); - if (ParametersForLbjCode.currentParameters.featuresToUse.containsKey("PredictionsLevel1")) { + if (prms.featuresToUse.containsKey("PredictionsLevel1")) { SparseAveragedPerceptron sap2 = (SparseAveragedPerceptron)taggerLevel2.getBaseLTU(); - sap2.setLearningRate(ParametersForLbjCode.currentParameters.learningRatePredictionsLevel2); - sap2.setThickness(ParametersForLbjCode.currentParameters.thicknessPredictionsLevel2); + sap2.setLearningRate(prms.learningRatePredictionsLevel2); + sap2.setThickness(prms.thicknessPredictionsLevel2); System.out.println("L2 learning rate = "+sap2.getLearningRate()+", thickness = "+sap2.getPositiveThickness()); } // there is a training directory, with training enabled, so train. We use the same dataset // for both training and evaluating. - LearningCurveMultiDataset.getLearningCurve(iterations, trainDirName, devDirName, incremental); + LearningCurveMultiDataset.getLearningCurve(iterations, trainDirName, devDirName, incremental, prms); System.out.println("\n\n----- Final results for "+confFile+", verbose ------"); NETesterMultiDataset.test(testDirName, true, - ParametersForLbjCode.currentParameters.labelsToIgnoreInEvaluation, - ParametersForLbjCode.currentParameters.labelsToAnonymizeInEvaluation); + prms.labelsToIgnoreInEvaluation, prms.labelsToAnonymizeInEvaluation, prms); System.out.println("\n\n----- Final results for "+confFile+", F1 only ------"); NETesterMultiDataset.test(testDirName, false, - ParametersForLbjCode.currentParameters.labelsToIgnoreInEvaluation, - ParametersForLbjCode.currentParameters.labelsToAnonymizeInEvaluation); + prms.labelsToIgnoreInEvaluation, prms.labelsToAnonymizeInEvaluation, prms); } else { System.out.println("Training requires a \"train\", \"test\" and \"dev\" subdirectory, " + "not so within "+dir+", skipping that directory."); } } else if (!release) { System.out.println("\n\n----- Reporting results from existing models for "+confFile+" ------"); - Parameters.readConfigAndLoadExternalData(confFile, !skiptraining); + prms = Parameters.readConfigAndLoadExternalData(confFile, !skiptraining); ResourceManager rm = new ResourceManager(confFile); - ModelLoader.load(rm, rm.getString("modelName"), !skiptraining); + ModelLoader.load(rm, rm.getString("modelName"), !skiptraining, prms); System.out.println("Benchmark against configuration : " + confFile); if (reportLabels) - NEDisplayPredictions.test(testDirName, "-c", verbose); + NEDisplayPredictions.test(testDirName, "-c", verbose, prms); else if (reportFeatures) - NETesterMultiDataset.dumpFeaturesLabeledData(testDirName, output); + NETesterMultiDataset.dumpFeaturesLabeledData(testDirName, output, prms); else NETesterMultiDataset.test(testDirName, verbose, - ParametersForLbjCode.currentParameters.labelsToIgnoreInEvaluation, - ParametersForLbjCode.currentParameters.labelsToAnonymizeInEvaluation); + prms.labelsToIgnoreInEvaluation, prms.labelsToAnonymizeInEvaluation, prms); } if (release) { if (trainDir.exists() && testDir.exists() && devDir.exists()) { - Parameters.readConfigAndLoadExternalData(confFile, true); + prms = Parameters.readConfigAndLoadExternalData(confFile, true); ResourceManager 
rm = new ResourceManager(confFile); - ModelLoader.load(rm, rm.getString("modelName"), true); + ModelLoader.load(rm, rm.getString("modelName"), true, prms); System.out.println("\n\n----- Building a final model for "+confFile+" ------"); // there is a training directory, with training enabled, so train. We use the same dataset // for both training and evaluating. - LearningCurveMultiDataset.buildFinalModel(iterations, trainDirName, testDirName, devDirName, incremental); + LearningCurveMultiDataset.buildFinalModel(iterations, trainDirName, testDirName, devDirName, incremental, prms); System.out.println("\n\n----- Release results for "+confFile+", verbose ------"); NETesterMultiDataset.test(devDirName, true, - ParametersForLbjCode.currentParameters.labelsToIgnoreInEvaluation, - ParametersForLbjCode.currentParameters.labelsToAnonymizeInEvaluation); + prms.labelsToIgnoreInEvaluation, prms.labelsToAnonymizeInEvaluation, prms); System.out.println("\n\n----- Release results for "+confFile+", F1 only ------"); NETesterMultiDataset.test(devDirName, false, - ParametersForLbjCode.currentParameters.labelsToIgnoreInEvaluation, - ParametersForLbjCode.currentParameters.labelsToAnonymizeInEvaluation); + prms.labelsToIgnoreInEvaluation, prms.labelsToAnonymizeInEvaluation, prms); } else { System.out.println("Building a final model requires a \"train\", \"test\" and \"dev\" subdirectory, " + "not so within "+dir+", skipping that directory."); diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NerTagger.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NerTagger.java index 4e1280718..50a1008ec 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NerTagger.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NerTagger.java @@ -31,11 +31,11 @@ public static void main(String[] args) { System.exit(-1); } - ParametersForLbjCode cp = ParametersForLbjCode.currentParameters; + ParametersForLbjCode cp = null; try { boolean areWeTraining = args[0].equalsIgnoreCase("-train"); ResourceManager rm = new ResourceManager(args[args.length - 1]); - Parameters.readConfigAndLoadExternalData(args[args.length - 1], areWeTraining); + cp = Parameters.readConfigAndLoadExternalData(args[args.length - 1], areWeTraining); if (args[0].equalsIgnoreCase("-train")) { String dataFormat; // config file is always the last one. 
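The NerTagger driver below shows the end-to-end wiring for inference. Condensed, its -annotate branch amounts to the following (the config and data paths are hypothetical placeholders):

    String configFile = "config/conll.config"; // hypothetical
    ResourceManager rm = new ResourceManager(configFile);
    ParametersForLbjCode cp = Parameters.readConfigAndLoadExternalData(configFile, false);
    ModelLoader.load(rm, rm.getString("modelName"), false, cp); // fills cp.taggerLevel1/2
    NETagPlain.tagData("data/input", "data/output", cp);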
@@ -44,15 +44,14 @@ public static void main(String[] args) { }else{ dataFormat = args[3]; } - LearningCurveMultiDataset.getLearningCurve(-1, dataFormat, args[1], args[2], false); + LearningCurveMultiDataset.getLearningCurve(-1, dataFormat, args[1], args[2], false, cp); }else if (args[0].equalsIgnoreCase("-trainFixedIterations")) - LearningCurveMultiDataset.getLearningCurve(Integer.parseInt(args[1]), args[2], args[3], false); + LearningCurveMultiDataset.getLearningCurve(Integer.parseInt(args[1]), args[2], args[3], false, cp); else { // load up the models - ModelLoader.load(rm, rm.getString("modelName"), false); + ModelLoader.load(rm, rm.getString("modelName"), false, cp); if (args[0].equalsIgnoreCase("-annotate")) { - NETagPlain.init(); - NETagPlain.tagData(args[1], args[2]); + NETagPlain.tagData(args[1], args[2], cp); } if (args[0].equalsIgnoreCase("-demo")) { String input = ""; @@ -61,8 +60,7 @@ public static void main(String[] args) { if (input.equalsIgnoreCase("quit")) System.exit(0); String res = NETagPlain.tagLine(input, - (NETaggerLevel1) ParametersForLbjCode.currentParameters.taggerLevel1, - (NETaggerLevel2) ParametersForLbjCode.currentParameters.taggerLevel2); + (NETaggerLevel1) cp.taggerLevel1, (NETaggerLevel2) cp.taggerLevel2, cp); res = NETagPlain.insertHtmlColors(res); StringTokenizer st = new StringTokenizer(res); StringBuilder output = new StringBuilder(); @@ -82,10 +80,10 @@ public static void main(String[] args) { dataFormat = args[2]; } NETesterMultiDataset.test(args[1], true, dataFormat, cp.labelsToIgnoreInEvaluation, - cp.labelsToAnonymizeInEvaluation); + cp.labelsToAnonymizeInEvaluation, cp); } if (args[0].equalsIgnoreCase("-dumpFeatures")) - NETesterMultiDataset.dumpFeaturesLabeledData(args[1], args[2]); + NETesterMultiDataset.dumpFeaturesLabeledData(args[1], args[2], cp); } } catch (Exception e) { logger.error("Exception caught: "); diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/BracketFileReader.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/BracketFileReader.java index ce2ad7112..132fd8f26 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/BracketFileReader.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/BracketFileReader.java @@ -22,26 +22,26 @@ class BracketFileReader { private static Logger logger = LoggerFactory.getLogger(BracketFileReader.class); - public static NERDocument read(String fileName, String docname) throws Exception { + public static NERDocument read(String fileName, String docname, ParametersForLbjCode cp) throws Exception { logger.info("Reading the file: " + fileName); - String annotatedText = PlainTextReader.normalizeText(getFileText(fileName)); - return parseTextWithBrackets(annotatedText, docname); + String annotatedText = PlainTextReader.normalizeText(getFileText(fileName), cp); + return parseTextWithBrackets(annotatedText, docname, cp); } - public static NERDocument parseTextWithBrackets(String annotatedText, String docname) + public static NERDocument parseTextWithBrackets(String annotatedText, String docname, ParametersForLbjCode cp) throws Exception { if (annotatedText.replace(" ", "").replace("\n", "").replace("\t", "").length() == 0) return new NERDocument(new ArrayList(), docname); Vector bracketTokens = new Vector<>();// can include newlines!!!! 
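For context, a sketch of the bracket format this reader consumes, called from within the package; the label names come from cp.labelTypes and the sentence text is invented for illustration (generics are elided in the surrounding diff but shown here):

    String annotated = "[PER Wolff ] , currently a journalist in [LOC Argentina ] .";
    Vector<String> tags = new Vector<>();
    Vector<String> words = new Vector<>();
    BracketFileReader.parseBracketsAnnotatedText(annotated, tags, words, cp);
    // tokens inside [PER ... ] come back tagged B-PER/I-PER; the remaining
    // tokens are left with the reader's default label (presumably "O").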
Vector bracketTokensTags = new Vector<>(); - parseBracketsAnnotatedText(annotatedText, bracketTokensTags, bracketTokens); + parseBracketsAnnotatedText(annotatedText, bracketTokensTags, bracketTokens, cp); StringBuilder buff = new StringBuilder(bracketTokens.size() * 20); for (int i = 0; i < bracketTokens.size(); i++) buff.append(bracketTokens.elementAt(i)).append(" "); // the tokens below will have no newline characters. // logger.info("Raw text: "+buff); Vector> parsedTokens = - PlainTextReader.sentenceSplitAndTokenizeText(buff.toString()); + PlainTextReader.sentenceSplitAndTokenizeText(buff.toString(), cp); // now we need to align the bracket tokens to the sentence split and tokenized tokens. // there are two issues to be careful with - // 1) The bracket tokens may have newline characters as individual tokens, the others will @@ -92,7 +92,7 @@ public static NERDocument parseTextWithBrackets(String annotatedText, String doc LinkedVector sentence = new LinkedVector(); for (int j = 0; j < parsedTokens.elementAt(i).size(); j++) { NEWord.addTokenToSentence(sentence, parsedTokensFlat.elementAt(parsedTokensPos), - parsedTokensTagsFlat.elementAt(parsedTokensPos)); + parsedTokensTagsFlat.elementAt(parsedTokensPos), cp); parsedTokensPos++; } res.add(sentence); @@ -100,50 +100,48 @@ public static NERDocument parseTextWithBrackets(String annotatedText, String doc return new NERDocument(res, docname); } - /* + /** * note that this one will do very little normalization/tokenization and token splitting. these * fancy stuff is done after we get the brackets files tokens and tags. it is important however * to keep the newline token to know where to split the sentences if we trust newlines as new * sentence starts. */ public static void parseBracketsAnnotatedText(String text, Vector tags, - Vector words) { + Vector words, ParametersForLbjCode cp) { // Add spaces before and after each bracket, except for after open bracket [ text = text.replace("]", " ] "); - for (int i = 0; i < ParametersForLbjCode.currentParameters.labelTypes.length; i++) + for (int i = 0; i < cp.labelTypes.length; i++) text = - text.replace("[" + ParametersForLbjCode.currentParameters.labelTypes[i], " [" - + ParametersForLbjCode.currentParameters.labelTypes[i] + " "); - + text.replace("[" + cp.labelTypes[i], " [" + cp.labelTypes[i] + " "); Vector tokens = new Vector<>(); - text = PlainTextReader.normalizeText(text); + text = PlainTextReader.normalizeText(text, cp); StringTokenizer stLines = new StringTokenizer(text, "\n"); while (stLines.hasMoreTokens()) { String line = stLines.nextToken(); StringTokenizer st = new StringTokenizer(line, " \t"); while (st.hasMoreTokens()) tokens.addElement(st.nextToken()); - if (ParametersForLbjCode.currentParameters.forceNewSentenceOnLineBreaks - || ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting) + if (cp.forceNewSentenceOnLineBreaks + || cp.keepOriginalFileTokenizationAndSentenceSplitting) tokens.addElement("\n"); } for (int i = 0; i < tokens.size(); i++) { boolean added = false; - for (int labelType = 0; labelType < ParametersForLbjCode.currentParameters.labelTypes.length; labelType++) { + for (int labelType = 0; labelType < cp.labelTypes.length; labelType++) { if (tokens.elementAt(i).equals( - "[" + ParametersForLbjCode.currentParameters.labelTypes[labelType])) { + "[" + cp.labelTypes[labelType])) { i++; boolean first = true; while (!tokens.elementAt(i).equals("]")) { words.addElement(tokens.elementAt(i)); if (first) { tags.addElement("B-" - + 
ParametersForLbjCode.currentParameters.labelTypes[labelType]); + + cp.labelTypes[labelType]); first = false; } else { tags.addElement("I-" - + ParametersForLbjCode.currentParameters.labelTypes[labelType]); + + cp.labelTypes[labelType]); } i++; } diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/BuildEvaluationFiles.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/BuildEvaluationFiles.java index 8659f9d20..85e8aafc6 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/BuildEvaluationFiles.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/BuildEvaluationFiles.java @@ -8,6 +8,7 @@ package edu.illinois.cs.cogcomp.ner.ParsingProcessingData; import edu.illinois.cs.cogcomp.ner.IO.OutFile; +import edu.illinois.cs.cogcomp.ner.LbjTagger.ParametersForLbjCode; import edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.OccurrenceCounter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,9 +42,10 @@ public static void buildEvaluationFile(String[] goldFiles, String[] taggedFiles, public static void appendToEvaluationFile(String goldFile, String taggedFile, OutFile outPhrase, OutFile outToken) { + ParametersForLbjCode cp = new ParametersForLbjCode(); Vector goldTags = new Vector<>(); Vector goldWords = new Vector<>(); - BracketFileReader.parseBracketsAnnotatedText(goldFile, goldTags, goldWords); + BracketFileReader.parseBracketsAnnotatedText(goldFile, goldTags, goldWords, cp); Vector tempgoldTags = new Vector<>(); Vector tempgoldWords = new Vector<>(); Hashtable newlines = new Hashtable<>(); @@ -62,7 +64,7 @@ public static void appendToEvaluationFile(String goldFile, String taggedFile, Vector resTags = new Vector<>(); Vector resWords = new Vector<>(); - BracketFileReader.parseBracketsAnnotatedText(taggedFile, resTags, resWords); + BracketFileReader.parseBracketsAnnotatedText(taggedFile, resTags, resWords, cp); Vector tempresTags = new Vector<>(); Vector tempresWords = new Vector<>(); for (int i = 0; i < resWords.size(); i++) { diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/ColumnFileReader.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/ColumnFileReader.java index 5f7a1025b..0e283875a 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/ColumnFileReader.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/ColumnFileReader.java @@ -10,6 +10,7 @@ import edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument; import edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord; +import edu.illinois.cs.cogcomp.ner.LbjTagger.ParametersForLbjCode; import edu.illinois.cs.cogcomp.lbjava.nlp.ColumnFormat; import edu.illinois.cs.cogcomp.lbjava.nlp.Word; import edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector; @@ -19,10 +20,11 @@ class ColumnFileReader extends ColumnFormat { String filename = null; - - public ColumnFileReader(String file) { + ParametersForLbjCode params = null; + public ColumnFileReader(String file, ParametersForLbjCode params) { super(file); filename = file; + this.params = params; } int linec = 0; @@ -52,7 +54,7 @@ public Object next() { LinkedVector res = new LinkedVector(); NEWord w = new NEWord(new Word(token, pos), null, label); - NEWord.addTokenToSentence(res, w.form, w.neLabel); + NEWord.addTokenToSentence(res, w.form, w.neLabel, params); for (line = (String[]) super.next(); line != null && line.length > 0; line = (String[]) super.next()) { linec++; @@ -72,7 +74,7 @@ public 
Object next() { continue; } w = new NEWord(new Word(token, pos), null, label); - NEWord.addTokenToSentence(res, w.form, w.neLabel); + NEWord.addTokenToSentence(res, w.form, w.neLabel, params); } if (res.size() == 0) return null; diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/PlainTextReader.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/PlainTextReader.java index e6163d06d..b839961c6 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/PlainTextReader.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/PlainTextReader.java @@ -20,7 +20,7 @@ import java.util.Vector; public class PlainTextReader { - public static ArrayList parsePlainTextFile(String file) { + public static ArrayList parsePlainTextFile(String file, ParametersForLbjCode cp) { InFile in = new InFile(file); String line = in.readLine(); StringBuilder buf = new StringBuilder(100000); @@ -30,17 +30,17 @@ public static ArrayList parsePlainTextFile(String file) { } buf.append(" "); in.close(); - return parseText(normalizeText(buf.toString())); + return parseText(normalizeText(buf.toString(), cp), cp); } - public static ArrayList parseText(String text) { - Vector> processed = sentenceSplitAndTokenizeText(text); + public static ArrayList parseText(String text, ParametersForLbjCode cp) { + Vector> processed = sentenceSplitAndTokenizeText(text, cp); ArrayList res = new ArrayList<>(); for (int i = 0; i < processed.size(); i++) { LinkedVector sentence = new LinkedVector(); for (int j = 0; j < processed.elementAt(i).size(); j++) NEWord.addTokenToSentence(sentence, processed.elementAt(i).elementAt(j), - "unlabeled"); + "unlabeled", cp); res.add(sentence); } TaggedDataReader.connectSentenceBoundaries(res); @@ -55,14 +55,14 @@ public static ArrayList parseText(String text) { * @param text the text to parse. * @return a list of sentences represented as an array of words. */ - public static List parseTextRaw(String text) { - text = normalizeText(text); + public static List parseTextRaw(String text, ParametersForLbjCode cp) { + text = normalizeText(text, cp); ArrayList sentences1 = new ArrayList<>();// sentences split by newlines. will keep // just one element- the text if no // sentence splitting on newlines is // used... - if (ParametersForLbjCode.currentParameters.forceNewSentenceOnLineBreaks - || ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting) { + if (cp.forceNewSentenceOnLineBreaks + || cp.keepOriginalFileTokenizationAndSentenceSplitting) { StringTokenizer st = new StringTokenizer(text, "\n"); while (st.hasMoreTokens()) sentences1.add(st.nextToken()); @@ -71,7 +71,7 @@ public static List parseTextRaw(String text) { ArrayList sentences2 = new ArrayList<>();// we add Lbj sentence splitting on // top. - if (!ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting) { + if (!cp.keepOriginalFileTokenizationAndSentenceSplitting) { for (String aSentences1 : sentences1) { SentenceSplitter parser = new SentenceSplitter(new String[] {aSentences1}); Sentence s = (Sentence) parser.next(); @@ -90,14 +90,14 @@ public static List parseTextRaw(String text) { // adding the space before the final period in the sentence, // this is just a formatting issue with LBJ sentence splitter that can happen if (sentenceText.charAt(sentenceText.length() - 1) == '.' 
- && !ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting) + && !cp.keepOriginalFileTokenizationAndSentenceSplitting) sentenceText = sentenceText.substring(0, sentenceText.length() - 1) + " . "; // now tokenizing for real... String[] sentence = sentenceText.split("[ \\n\\t]"); if (sentence.length > 0) { // fixing a bug in LBJ sentence splitter if needed - if ((!ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting) + if ((!cp.keepOriginalFileTokenizationAndSentenceSplitting) && sentence.length == 1 && res.size() > 0 && (sentence[0].equals("\"") || sentence[0].equals("''") || sentence[0] @@ -122,13 +122,12 @@ public static List parseTextRaw(String text) { } - public static Vector> sentenceSplitAndTokenizeText(String text) { - text = normalizeText(text); + public static Vector> sentenceSplitAndTokenizeText(String text, ParametersForLbjCode cp) { + text = normalizeText(text, cp); Vector sentences1 = new Vector<>();// sentences split by newlines. will keep just // one element- the text if no sentence splitting // on newlines is used... - if (ParametersForLbjCode.currentParameters.forceNewSentenceOnLineBreaks - || ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting) { + if (cp.forceNewSentenceOnLineBreaks || cp.keepOriginalFileTokenizationAndSentenceSplitting) { StringTokenizer st = new StringTokenizer(text, "\n"); while (st.hasMoreTokens()) sentences1.addElement(st.nextToken()); @@ -136,7 +135,7 @@ public static Vector> sentenceSplitAndTokenizeText(String text) { sentences1.addElement(text); Vector sentences2 = new Vector<>();// we add Lbj sentence splitting on top. - if (!ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting) { + if (!cp.keepOriginalFileTokenizationAndSentenceSplitting) { for (int i = 0; i < sentences1.size(); i++) { SentenceSplitter parser = new SentenceSplitter(new String[] {sentences1.elementAt(i)}); @@ -156,7 +155,7 @@ public static Vector> sentenceSplitAndTokenizeText(String text) { // adding the space before the final period in the sentence, // this is just a formatting issue with LBJ sentence splitter that can happen if (sentenceText.charAt(sentenceText.length() - 1) == '.' - && !ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting) + && !cp.keepOriginalFileTokenizationAndSentenceSplitting) sentenceText = sentenceText.substring(0, sentenceText.length() - 1) + " . "; // now tokenizing for real... 
StringTokenizer st = new StringTokenizer(sentenceText, " \n\t"); @@ -165,7 +164,7 @@ public static Vector> sentenceSplitAndTokenizeText(String text) { sentence.addElement(st.nextToken()); if (sentence.size() > 0) { // fixing a bug in LBJ sentence splitter if needed - if ((!ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting) + if ((!cp.keepOriginalFileTokenizationAndSentenceSplitting) && sentence.size() == 1 && res.size() > 0 && (sentence.elementAt(0).equals("\"") @@ -180,8 +179,8 @@ public static Vector> sentenceSplitAndTokenizeText(String text) { return res; } - public static String normalizeText(String text) { - if (ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting) + public static String normalizeText(String text, ParametersForLbjCode cp) { + if (cp.keepOriginalFileTokenizationAndSentenceSplitting) return text; StringBuilder buf = new StringBuilder((int) (text.length() * 1.2)); for (int i = 0; i < text.length(); i++) { @@ -220,15 +219,15 @@ public static String normalizeText(String text) { text = text.replace(";", " ; "); text = text.replace("]", " ] "); // now, I want to replace all '[' by ' [ ', but I have to be careful with chunk markers! - for (int i = 0; i < ParametersForLbjCode.currentParameters.labelTypes.length; i++) + for (int i = 0; i < cp.labelTypes.length; i++) text = - text.replace("[" + ParametersForLbjCode.currentParameters.labelTypes[i], - "_START_" + ParametersForLbjCode.currentParameters.labelTypes[i] + "_"); + text.replace("[" + cp.labelTypes[i], + "_START_" + cp.labelTypes[i] + "_"); text = text.replace("[", " [ "); - for (int i = 0; i < ParametersForLbjCode.currentParameters.labelTypes.length; i++) + for (int i = 0; i < cp.labelTypes.length; i++) text = - text.replace("_START_" + ParametersForLbjCode.currentParameters.labelTypes[i] - + "_", " [" + ParametersForLbjCode.currentParameters.labelTypes[i]); + text.replace("_START_" + cp.labelTypes[i] + + "_", " [" + cp.labelTypes[i]); text = text.replace(")", " ) "); text = text.replace("(", " ( "); text = text.replace("{", " { "); diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/TaggedDataReader.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/TaggedDataReader.java index 92fe313f2..e01eb0866 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/TaggedDataReader.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/TaggedDataReader.java @@ -26,25 +26,25 @@ public class TaggedDataReader { private static Logger logger = LoggerFactory.getLogger(TaggedDataReader.class); public static NERDocument parseTextAnnotatedWithBrackets(String annotatedText, - String documentName) throws Exception { - return BracketFileReader.parseTextWithBrackets(annotatedText, documentName); + String documentName, ParametersForLbjCode cp) throws Exception { + return BracketFileReader.parseTextWithBrackets(annotatedText, documentName, cp); } - public static Vector readFolder(String path, String format) throws Exception { + public static Vector readFolder(String path, String format, ParametersForLbjCode cp) throws Exception { Vector res = new Vector<>(); String[] files = (new File(path)).list(); // sort the files so we can get deterministic order. 
- if (ParametersForLbjCode.currentParameters.sortLexicallyFilesInFolders) { + if (cp.sortLexicallyFilesInFolders) { Arrays.sort(files); } for (String file1 : files) { String file = path + "/" + file1; if ((new File(file)).isFile() && (!file1.equals(".DS_Store"))) { - res.addElement(readFile(file, format, file1)); + res.addElement(readFile(file, format, file1, cp)); } } - if (ParametersForLbjCode.currentParameters.treatAllFilesInFolderAsOneBigDocument) { + if (cp.treatAllFilesInFolderAsOneBigDocument) { // connecting sentence boundaries for (int i = 0; i < res.size() - 1; i++) { ArrayList ss1 = res.elementAt(i).sentences; @@ -66,13 +66,13 @@ public static Vector readFolder(String path, String format) throws return res; } - public static NERDocument readFile(String path, String format, String documentName) + public static NERDocument readFile(String path, String format, String documentName, ParametersForLbjCode cp) throws Exception { NERDocument res = null; if (format.equals("-c")) { - res = (new ColumnFileReader(path)).read(documentName); + res = (new ColumnFileReader(path, cp)).read(documentName); } else if (format.equals("-r")) { - res = BracketFileReader.read(path, documentName); + res = BracketFileReader.read(path, documentName, cp); }else if (format.equals("-json")) { TextAnnotation ta = SerializationHelper.deserializeTextAnnotationFromFile(path, true); res = TextAnnotationConverter.getNerDocument(ta); diff --git a/ner/src/main/lbj/LbjTagger.lbj b/ner/src/main/lbj/LbjTagger.lbj index d3e5bc825..1077fc480 100644 --- a/ner/src/main/lbj/LbjTagger.lbj +++ b/ner/src/main/lbj/LbjTagger.lbj @@ -15,14 +15,14 @@ import edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.*; discrete% wordType(NEWord word) <- { - if(ParametersForLbjCode.currentParameters.featuresToUse.containsKey("WordTopicTitleInfo")){ + if(word.params.featuresToUse.containsKey("WordTopicTitleInfo")){ sense "" : WordTopicAndLayoutFeatures.getWordType(word); } } discrete% GazetteersFeatures(NEWord word) <- { - if(ParametersForLbjCode.currentParameters.featuresToUse.containsKey("GazetteersFeatures")) + if(word.params.featuresToUse.containsKey("GazetteersFeatures")) { int i=0; NEWord w = word, last = (NEWord)word.next; @@ -44,7 +44,7 @@ discrete% GazetteersFeatures(NEWord word) <- real% WordEmbeddingFeatures(NEWord word) <- { - if(ParametersForLbjCode.currentParameters.featuresToUse.containsKey("WordEmbeddings")) + if(word.params.featuresToUse.containsKey("WordEmbeddings")) { int i; NEWord w = word, last = word; @@ -63,7 +63,7 @@ real% WordEmbeddingFeatures(NEWord word) <- discrete% WikifierFeatures(NEWord word) <- { - if(ParametersForLbjCode.currentParameters.featuresToUse.containsKey("WikifierFeatures")){ + if(word.params.featuresToUse.containsKey("WikifierFeatures")){ if(word.wikifierFeatures != null){ for(int i = 0; i < word.wikifierFeatures.length; i++){ @@ -83,7 +83,7 @@ discrete% IsSentenceStart(NEWord word) <- discrete% Forms(NEWord word) <- { - if(ParametersForLbjCode.currentParameters.featuresToUse.containsKey("Forms")) + if(word.params.featuresToUse.containsKey("Forms")) { int i; NEWord w = word, last = word; @@ -105,9 +105,9 @@ discrete% Forms(NEWord word) <- // Problem 1 discrete% BrownClusterPaths(NEWord word) <- { - if(ParametersForLbjCode.currentParameters.featuresToUse.containsKey("BrownClusterPaths")) + if(word.params.featuresToUse.containsKey("BrownClusterPaths")) { - BrownClusters bc = BrownClusters.get(); + BrownClusters bc = word.params.brownClusters; int i; NEWord w = word, last = word; for (i = 0; i <= 2 && 
last != null; ++i) last = (NEWord) last.next; @@ -126,8 +126,8 @@ discrete% BrownClusterPaths(NEWord word) <- // Problem 1 discrete% FormParts(NEWord word) <- { - if(ParametersForLbjCode.currentParameters.featuresToUse.containsKey("Forms")&& - ParametersForLbjCode.currentParameters.tokenizationScheme.equals(ParametersForLbjCode.TokenizationScheme.DualTokenizationScheme)) + if(word.params.featuresToUse.containsKey("Forms")&& + word.params.tokenizationScheme.equals(ParametersForLbjCode.TokenizationScheme.DualTokenizationScheme)) { sense "0" : word.form; int i=-1; @@ -163,7 +163,7 @@ discrete% FormParts(NEWord word) <- // Feature set i discrete{false, true}% Capitalization(NEWord word) <- { - if(ParametersForLbjCode.currentParameters.featuresToUse.containsKey("Capitalization")) + if(word.params.featuresToUse.containsKey("Capitalization")) { int i; NEWord w = word, last = word; @@ -177,7 +177,7 @@ discrete{false, true}% Capitalization(NEWord word) <- // Feature set ii discrete{false, true}% WordTypeInformation(NEWord word) <- { - if(ParametersForLbjCode.currentParameters.featuresToUse.containsKey("WordTypeInformation")) + if(word.params.featuresToUse.containsKey("WordTypeInformation")) { int i; NEWord w = word, last = word; @@ -204,7 +204,7 @@ discrete{false, true}% WordTypeInformation(NEWord word) <- // Feature set iii discrete% Affixes(NEWord word) <- { - if(ParametersForLbjCode.currentParameters.featuresToUse.containsKey("Affixes")) + if(word.params.featuresToUse.containsKey("Affixes")) { int N = word.form.length(); for (int i = 3; i <= 4; ++i) @@ -212,7 +212,7 @@ discrete% Affixes(NEWord word) <- for (int i = 1; i <= 4; ++i) if (word.form.length() > i) sense "s|" : word.form.substring(N - i); - if(ParametersForLbjCode.currentParameters.tokenizationScheme.equals(ParametersForLbjCode.TokenizationScheme.DualTokenizationScheme)) + if(word.params.tokenizationScheme.equals(ParametersForLbjCode.TokenizationScheme.DualTokenizationScheme)) for(int i=0;i sentences = PlainTextReader.parseText(TEST_INPUT); + ArrayList sentences = PlainTextReader.parseText(TEST_INPUT, params); Data data = new Data(new NERDocument(sentences, "input")); String output = null; try { - output = NETagPlain.tagData(data, t1, t2); + output = NETagPlain.tagData(data, params); } catch (Exception e) { logger.info("Cannot annotate the test, the exception was: "); e.printStackTrace(); diff --git a/ner/src/test/java/edu/illinois/cs/cogcomp/ner/reference/ReferenceUtils.java b/ner/src/test/java/edu/illinois/cs/cogcomp/ner/reference/ReferenceUtils.java index 918a070a9..ea5567050 100644 --- a/ner/src/test/java/edu/illinois/cs/cogcomp/ner/reference/ReferenceUtils.java +++ b/ner/src/test/java/edu/illinois/cs/cogcomp/ner/reference/ReferenceUtils.java @@ -14,6 +14,7 @@ import edu.illinois.cs.cogcomp.ner.LbjTagger.Data; import edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument; import edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord; +import edu.illinois.cs.cogcomp.ner.LbjTagger.ParametersForLbjCode; import java.util.ArrayList; @@ -23,7 +24,7 @@ */ public class ReferenceUtils { - public Data createNerDataStructuresForText(TextAnnotation ta) { + public Data createNerDataStructuresForText(TextAnnotation ta, ParametersForLbjCode params) { ArrayList sentences = new ArrayList<>(); String[] tokens = ta.getTokens(); int[] tokenindices = new int[tokens.length]; @@ -35,7 +36,7 @@ public Data createNerDataStructuresForText(TextAnnotation ta) { LinkedVector words = new LinkedVector(); for (String w : wtoks) { if (w.length() > 0) { - 
NEWord.addTokenToSentence(words, w, "unlabeled"); + NEWord.addTokenToSentence(words, w, "unlabeled", params); tokenindices[neWordIndex] = tokenIndex; neWordIndex++; } else { diff --git a/relation-extraction/src/main/java/org/cogcomp/re/ACEMentionReader.java b/relation-extraction/src/main/java/org/cogcomp/re/ACEMentionReader.java index 896c47a17..4829d90b3 100644 --- a/relation-extraction/src/main/java/org/cogcomp/re/ACEMentionReader.java +++ b/relation-extraction/src/main/java/org/cogcomp/re/ACEMentionReader.java @@ -9,6 +9,7 @@ import edu.illinois.cs.cogcomp.chunker.main.ChunkerAnnotator; import edu.illinois.cs.cogcomp.chunker.main.ChunkerConfigurator; +import edu.illinois.cs.cogcomp.core.constants.Language; import edu.illinois.cs.cogcomp.core.datastructures.ViewNames; import edu.illinois.cs.cogcomp.core.datastructures.textannotation.*; import edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator; @@ -68,10 +69,9 @@ public ACEMentionReader(String file, String type) { chunker.initialize(new ChunkerConfigurator().getDefaultConfig()); Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig()); File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.6, false); - GazetteersFactory.init(5, gazetteersResource.getPath() + File.separator + "gazetteers", true); + Gazetteers gazetteers = GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English); WordNetManager.loadConfigAsClasspathResource(true); WordNetManager wordNet = WordNetManager.getInstance(); - Gazetteers gazetteers = GazetteersFactory.get(); Properties stanfordProps = new Properties(); stanfordProps.put("annotators", "pos, parse"); stanfordProps.put("parse.originalDependencies", true); diff --git a/relation-extraction/src/main/java/org/cogcomp/re/ExampleUsage.java b/relation-extraction/src/main/java/org/cogcomp/re/ExampleUsage.java index 3118572a7..849383493 100644 --- a/relation-extraction/src/main/java/org/cogcomp/re/ExampleUsage.java +++ b/relation-extraction/src/main/java/org/cogcomp/re/ExampleUsage.java @@ -10,6 +10,7 @@ import edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder; import edu.illinois.cs.cogcomp.chunker.main.ChunkerAnnotator; import edu.illinois.cs.cogcomp.chunker.main.ChunkerConfigurator; +import edu.illinois.cs.cogcomp.core.constants.Language; import edu.illinois.cs.cogcomp.core.datastructures.ViewNames; import edu.illinois.cs.cogcomp.core.datastructures.textannotation.*; import edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator; @@ -21,6 +22,7 @@ import edu.illinois.cs.cogcomp.pipeline.common.Stanford331Configurator; import edu.illinois.cs.cogcomp.pipeline.handlers.StanfordDepHandler; import edu.illinois.cs.cogcomp.pos.POSAnnotator; +//import edu.stanford.nlp.pipeline.CoreNLPProtos.Language; import edu.stanford.nlp.pipeline.POSTaggerAnnotator; import edu.stanford.nlp.pipeline.ParserAnnotator; import org.cogcomp.Datastore; @@ -104,6 +106,7 @@ public static void SemEvalAnnotate() { ParserAnnotator parseAnnotator = new ParserAnnotator("parse", stanfordProps); StanfordDepHandler stanfordDepHandler = new StanfordDepHandler(posAnnotator, parseAnnotator); String modelPath = ""; + FlatGazetteers gazetteers = null; try { ta.addView(pos_annotator); chunker.addView(ta); @@ -112,7 +115,8 @@ public static void SemEvalAnnotate() { File model = ds.getDirectory("org.cogcomp.re", "SEMEVAL", 1.1, false); modelPath = model.getPath(); File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, 
false); - GazetteersFactory.init(5, gazetteersResource.getPath() + File.separator + "gazetteers", true); + gazetteers = (FlatGazetteers) GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", + true, Language.English); WordNetManager.loadConfigAsClasspathResource(true); WordNetManager wordnet = WordNetManager.getInstance(); View annotatedTokenView = new SpanLabelView("RE_ANNOTATED", ta); @@ -130,7 +134,6 @@ public static void SemEvalAnnotate() { catch (Exception e){ e.printStackTrace(); } - FlatGazetteers gazetteers = (FlatGazetteers)GazetteersFactory.get(); Constituent source = new Constituent("first", "Mention", ta, 0, 1); diff --git a/relation-extraction/src/main/java/org/cogcomp/re/PredictedMentionReader.java b/relation-extraction/src/main/java/org/cogcomp/re/PredictedMentionReader.java index a5c054812..6b8fd16c1 100644 --- a/relation-extraction/src/main/java/org/cogcomp/re/PredictedMentionReader.java +++ b/relation-extraction/src/main/java/org/cogcomp/re/PredictedMentionReader.java @@ -9,6 +9,7 @@ import edu.illinois.cs.cogcomp.chunker.main.ChunkerAnnotator; import edu.illinois.cs.cogcomp.chunker.main.ChunkerConfigurator; +import edu.illinois.cs.cogcomp.core.constants.Language; import edu.illinois.cs.cogcomp.core.datastructures.ViewNames; import edu.illinois.cs.cogcomp.core.datastructures.textannotation.*; import edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator; @@ -46,10 +47,10 @@ public PredictedMentionReader(String path){ Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig()); File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.6, false); - GazetteersFactory.init(5, gazetteersResource.getPath() + File.separator + "gazetteers", true); + Gazetteers gazetteers = GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", + true, Language.English); WordNetManager.loadConfigAsClasspathResource(true); WordNetManager wordNet = WordNetManager.getInstance(); - Gazetteers gazetteers = GazetteersFactory.get(); Properties stanfordProps = new Properties(); stanfordProps.put("annotators", "pos, parse"); stanfordProps.put("parse.originalDependencies", true); diff --git a/relation-extraction/src/main/java/org/cogcomp/re/RelationAnnotator.java b/relation-extraction/src/main/java/org/cogcomp/re/RelationAnnotator.java index b179ef314..58fd61284 100644 --- a/relation-extraction/src/main/java/org/cogcomp/re/RelationAnnotator.java +++ b/relation-extraction/src/main/java/org/cogcomp/re/RelationAnnotator.java @@ -9,6 +9,7 @@ import edu.illinois.cs.cogcomp.annotation.Annotator; import edu.illinois.cs.cogcomp.annotation.AnnotatorException; +import edu.illinois.cs.cogcomp.core.constants.Language; import edu.illinois.cs.cogcomp.core.datastructures.ViewNames; import edu.illinois.cs.cogcomp.core.datastructures.textannotation.*; import edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator; @@ -58,10 +59,10 @@ public void initialize(ResourceManager rm) { relationClassifier.readLexicon(lexFile); constrainedClassifier = new ACERelationConstrainedClassifier(relationClassifier); File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.6, false); - GazetteersFactory.init(5, gazetteersResource.getPath() + File.separator + "gazetteers", true); + gazetteers = GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", + true, Language.English); WordNetManager.loadConfigAsClasspathResource(true); wordNet = WordNetManager.getInstance(); - 
gazetteers = GazetteersFactory.get(); } catch (Exception e) { e.printStackTrace(); } diff --git a/relation-extraction/src/main/java/org/cogcomp/re/SemEvalMentionReader.java b/relation-extraction/src/main/java/org/cogcomp/re/SemEvalMentionReader.java index 06c87dff8..3a721418a 100644 --- a/relation-extraction/src/main/java/org/cogcomp/re/SemEvalMentionReader.java +++ b/relation-extraction/src/main/java/org/cogcomp/re/SemEvalMentionReader.java @@ -10,6 +10,7 @@ import edu.illinois.cs.cogcomp.annotation.BasicTextAnnotationBuilder; import edu.illinois.cs.cogcomp.chunker.main.ChunkerAnnotator; import edu.illinois.cs.cogcomp.chunker.main.ChunkerConfigurator; +import edu.illinois.cs.cogcomp.core.constants.Language; import edu.illinois.cs.cogcomp.core.datastructures.IntPair; import edu.illinois.cs.cogcomp.core.datastructures.Pair; import edu.illinois.cs.cogcomp.core.datastructures.ViewNames; @@ -50,11 +51,10 @@ public void initExternalTools(){ _posAnnotator = new POSAnnotator(); Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig()); File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false); - GazetteersFactory.init(5, gazetteersResource.getPath() + File.separator + "gazetteers", true); + _gazetteers = (FlatGazetteers)GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + + "gazetteers", true, Language.English); WordNetManager.loadConfigAsClasspathResource(true); _wordnet = WordNetManager.getInstance(); - _gazetteers = (FlatGazetteers)GazetteersFactory.get(); - __chunker = new ChunkerAnnotator(true); __chunker.initialize(new ChunkerConfigurator().getDefaultConfig()); From 1e896a66b961600ca0235664626c4f0d793e8f41 Mon Sep 17 00:00:00 2001 From: "Thomas L. Redman" Date: Wed, 5 Sep 2018 09:00:14 -0500 Subject: [PATCH 2/3] Next version. 
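
Bumps every module in the multi-module build from 4.0.12 to 4.0.13; the diffs below touch only pom.xml version numbers.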
--- big-data-utils/pom.xml | 4 ++-- chunker/pom.xml | 8 +++---- commasrl/pom.xml | 20 ++++++++-------- core-utilities/pom.xml | 2 +- corpusreaders/pom.xml | 6 ++--- curator/pom.xml | 4 ++-- dataless-classifier/pom.xml | 6 ++--- depparse/pom.xml | 12 +++++----- edison/pom.xml | 8 +++---- external/clausie/pom.xml | 4 ++-- external/external-commons/pom.xml | 6 ++--- external/path-lstm/pom.xml | 6 ++--- external/stanford_3.3.1/pom.xml | 6 ++--- external/stanford_3.8.0/pom.xml | 6 ++--- inference/pom.xml | 4 ++-- lbjava-nlp-tools/pom.xml | 4 ++-- lemmatizer/pom.xml | 6 ++--- md/pom.xml | 14 ++++++------ ner/pom.xml | 8 +++---- pipeline-client/pom.xml | 4 ++-- pipeline/pom.xml | 38 +++++++++++++++---------------- pom.xml | 2 +- pos/pom.xml | 2 +- prepsrl/pom.xml | 14 ++++++------ quantifier/pom.xml | 12 +++++----- question-type/pom.xml | 10 ++++---- relation-extraction/pom.xml | 18 +++++++-------- similarity/pom.xml | 6 ++--- temporal-normalizer/pom.xml | 14 ++++++------ tokenizer/pom.xml | 8 +++---- transliteration/pom.xml | 6 ++--- verbsense/pom.xml | 16 ++++++------- 32 files changed, 142 insertions(+), 142 deletions(-) diff --git a/big-data-utils/pom.xml b/big-data-utils/pom.xml index 7387d132d..309dccf1d 100644 --- a/big-data-utils/pom.xml +++ b/big-data-utils/pom.xml @@ -3,7 +3,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 4.0.0 @@ -23,7 +23,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 org.xeustechnologies.google-api diff --git a/chunker/pom.xml b/chunker/pom.xml index dca212c09..396126b5f 100644 --- a/chunker/pom.xml +++ b/chunker/pom.xml @@ -2,7 +2,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 4.0.0 @@ -13,7 +13,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 @@ -24,12 +24,12 @@ edu.illinois.cs.cogcomp LBJava-NLP-tools - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-pos - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp diff --git a/commasrl/pom.xml b/commasrl/pom.xml index 33b939ff0..1340e529a 100644 --- a/commasrl/pom.xml +++ b/commasrl/pom.xml @@ -4,7 +4,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 4.0.0 @@ -35,48 +35,48 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 true edu.illinois.cs.cogcomp illinois-curator - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-inference - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp stanford_3.3.1 - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-pos - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-ner - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-chunker - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp diff --git a/core-utilities/pom.xml b/core-utilities/pom.xml index a0a87217e..f8726c791 100644 --- a/core-utilities/pom.xml +++ b/core-utilities/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 illinois-core-utilities diff --git a/corpusreaders/pom.xml b/corpusreaders/pom.xml index cc04aabf1..d3bbe9716 100644 --- a/corpusreaders/pom.xml +++ b/corpusreaders/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 illinois-corpusreaders @@ -15,12 +15,12 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.12 + 4.0.13 org.slf4j diff --git a/curator/pom.xml b/curator/pom.xml index 9e7f6b8c3..3b418623c 100644 --- a/curator/pom.xml +++ b/curator/pom.xml @@ -7,7 +7,7 @@ 
illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 illinois-curator @@ -16,7 +16,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 diff --git a/dataless-classifier/pom.xml b/dataless-classifier/pom.xml index 1600e01ab..99bfb0302 100644 --- a/dataless-classifier/pom.xml +++ b/dataless-classifier/pom.xml @@ -3,7 +3,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 4.0.0 @@ -21,12 +21,12 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.12 + 4.0.13 org.slf4j diff --git a/depparse/pom.xml b/depparse/pom.xml index e5fd11c7b..273feda3d 100644 --- a/depparse/pom.xml +++ b/depparse/pom.xml @@ -7,7 +7,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 illinois-depparse @@ -16,27 +16,27 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-edison - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-lemmatizer - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-pos - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-chunker - 4.0.12 + 4.0.13 diff --git a/edison/pom.xml b/edison/pom.xml index 8e65d94a1..9271957ba 100644 --- a/edison/pom.xml +++ b/edison/pom.xml @@ -7,7 +7,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 illinois-edison @@ -16,7 +16,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 @@ -80,13 +80,13 @@ edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-curator - 4.0.12 + 4.0.13 test diff --git a/external/clausie/pom.xml b/external/clausie/pom.xml index 214d4d137..aaf02b6e0 100644 --- a/external/clausie/pom.xml +++ b/external/clausie/pom.xml @@ -5,7 +5,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 ../../pom.xml @@ -24,7 +24,7 @@ edu.illinois.cs.cogcomp external-commons - 4.0.12 + 4.0.13 org.slf4j diff --git a/external/external-commons/pom.xml b/external/external-commons/pom.xml index c8ee868ed..16cb296eb 100644 --- a/external/external-commons/pom.xml +++ b/external/external-commons/pom.xml @@ -2,7 +2,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 ../../pom.xml @@ -16,12 +16,12 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.12 + 4.0.13 org.cogcomp diff --git a/external/path-lstm/pom.xml b/external/path-lstm/pom.xml index 223ff6705..a8946df90 100644 --- a/external/path-lstm/pom.xml +++ b/external/path-lstm/pom.xml @@ -2,7 +2,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 ../../pom.xml @@ -16,12 +16,12 @@ edu.illinois.cs.cogcomp external-commons - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-edison - 4.0.12 + 4.0.13 org.cogcomp diff --git a/external/stanford_3.3.1/pom.xml b/external/stanford_3.3.1/pom.xml index 86d5765bc..864dbab7c 100644 --- a/external/stanford_3.3.1/pom.xml +++ b/external/stanford_3.3.1/pom.xml @@ -5,7 +5,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 ../../pom.xml @@ -19,7 +19,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 @@ -36,7 +36,7 @@ edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.12 + 4.0.13 diff --git a/external/stanford_3.8.0/pom.xml b/external/stanford_3.8.0/pom.xml index b3c0352a0..db91e80d0 100644 --- a/external/stanford_3.8.0/pom.xml +++ b/external/stanford_3.8.0/pom.xml @@ -2,7 +2,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 ../../pom.xml @@ -16,12 +16,12 @@ edu.illinois.cs.cogcomp 
illinois-corpusreaders - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp external-commons - 4.0.12 + 4.0.13 org.slf4j diff --git a/inference/pom.xml b/inference/pom.xml index 824130ddd..25c638c45 100644 --- a/inference/pom.xml +++ b/inference/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 jar @@ -22,7 +22,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp diff --git a/lbjava-nlp-tools/pom.xml b/lbjava-nlp-tools/pom.xml index 5e706b556..38478310e 100644 --- a/lbjava-nlp-tools/pom.xml +++ b/lbjava-nlp-tools/pom.xml @@ -4,7 +4,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 LBJava-NLP-tools @@ -30,7 +30,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 org.slf4j diff --git a/lemmatizer/pom.xml b/lemmatizer/pom.xml index 533fef4f5..1a5f8fb40 100644 --- a/lemmatizer/pom.xml +++ b/lemmatizer/pom.xml @@ -7,7 +7,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 illinois-lemmatizer @@ -16,12 +16,12 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-edison - 4.0.12 + 4.0.13 edu.stanford.nlp diff --git a/md/pom.xml b/md/pom.xml index 60d07dbfa..77059351d 100644 --- a/md/pom.xml +++ b/md/pom.xml @@ -3,7 +3,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 4.0.0 @@ -25,32 +25,32 @@ edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-pos - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-edison - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-ner - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp stanford_3.3.1 - 4.0.12 + 4.0.13 org.slf4j diff --git a/ner/pom.xml b/ner/pom.xml index fd8733f09..110eb0e26 100644 --- a/ner/pom.xml +++ b/ner/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 illinois-ner @@ -23,12 +23,12 @@ edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 org.cogcomp @@ -44,7 +44,7 @@ edu.illinois.cs.cogcomp LBJava-NLP-tools - 4.0.12 + 4.0.13 org.slf4j diff --git a/pipeline-client/pom.xml b/pipeline-client/pom.xml index 37461f1ec..2b69f5f0f 100644 --- a/pipeline-client/pom.xml +++ b/pipeline-client/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 illinois-pipeline-client @@ -15,7 +15,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 diff --git a/pipeline/pom.xml b/pipeline/pom.xml index 5424d64ba..99c14ef07 100644 --- a/pipeline/pom.xml +++ b/pipeline/pom.xml @@ -5,7 +5,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 illinois-nlp-pipeline @@ -16,57 +16,57 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-chunker - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-lemmatizer - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-quantifier - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-prep-srl - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-comma - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-verbsense - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-question-typer - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp stanford_3.3.1 - 4.0.12 + 4.0.13 org.cogcomp @@ -83,7 +83,7 @@ edu.illinois.cs.cogcomp illinois-ner - 4.0.12 + 
4.0.13 org.apache.commons @@ -93,17 +93,17 @@ edu.illinois.cs.cogcomp illinois-md - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-relation-extraction - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-datalessclassification - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp @@ -131,7 +131,7 @@ edu.illinois.cs.cogcomp illinois-depparse - 4.0.12 + 4.0.13 @@ -149,12 +149,12 @@ edu.illinois.cs.cogcomp illinois-time - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-transliteration - 4.0.12 + 4.0.13 diff --git a/pom.xml b/pom.xml index e5db3763c..14d5e9de1 100644 --- a/pom.xml +++ b/pom.xml @@ -7,7 +7,7 @@ edu.illinois.cs.cogcomp illinois-cogcomp-nlp pom - 4.0.12 + 4.0.13 core-utilities tokenizer diff --git a/pos/pom.xml b/pos/pom.xml index f10f81edb..b9b46c798 100644 --- a/pos/pom.xml +++ b/pos/pom.xml @@ -4,7 +4,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 illinois-pos diff --git a/prepsrl/pom.xml b/prepsrl/pom.xml index dd2969f0c..783dae1a9 100644 --- a/prepsrl/pom.xml +++ b/prepsrl/pom.xml @@ -5,7 +5,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 4.0.0 @@ -15,32 +15,32 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-edison - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp LBJava-NLP-tools - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-depparse - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-pos - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-lemmatizer - 4.0.12 + 4.0.13 diff --git a/quantifier/pom.xml b/quantifier/pom.xml index bc00e35d3..ebf3681be 100644 --- a/quantifier/pom.xml +++ b/quantifier/pom.xml @@ -4,7 +4,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 illinois-quantifier @@ -35,31 +35,31 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 compile edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.12 + 4.0.13 compile edu.illinois.cs.cogcomp illinois-pos - 4.0.12 + 4.0.13 compile edu.illinois.cs.cogcomp illinois-edison - 4.0.12 + 4.0.13 compile edu.illinois.cs.cogcomp illinois-curator - 4.0.12 + 4.0.13 compile diff --git a/question-type/pom.xml b/question-type/pom.xml index 7921f446a..df1c4de71 100644 --- a/question-type/pom.xml +++ b/question-type/pom.xml @@ -4,7 +4,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 illinois-question-typer @@ -13,22 +13,22 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-edison - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-pipeline-client - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp diff --git a/relation-extraction/pom.xml b/relation-extraction/pom.xml index 8cd175d58..ee52ae3c7 100644 --- a/relation-extraction/pom.xml +++ b/relation-extraction/pom.xml @@ -3,7 +3,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 4.0.0 @@ -33,42 +33,42 @@ edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-pos - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp stanford_3.3.1 - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-edison - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-ner - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-md - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-chunker - 4.0.12 + 4.0.13 joda-time diff --git a/similarity/pom.xml b/similarity/pom.xml index de7691b62..4cda2b043 100644 --- a/similarity/pom.xml +++ b/similarity/pom.xml @@ -4,7 +4,7 @@ 
illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 illinois-similarity @@ -13,7 +13,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 org.slf4j @@ -55,7 +55,7 @@ edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp diff --git a/temporal-normalizer/pom.xml b/temporal-normalizer/pom.xml index a7678bb8f..bae48dc4a 100644 --- a/temporal-normalizer/pom.xml +++ b/temporal-normalizer/pom.xml @@ -5,7 +5,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 4.0.0 illinois-time @@ -13,7 +13,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp @@ -28,7 +28,7 @@ edu.illinois.cs.cogcomp illinois-pos - 4.0.12 + 4.0.13 org.apache.uima @@ -43,12 +43,12 @@ edu.illinois.cs.cogcomp illinois-chunker - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-curator - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp @@ -58,7 +58,7 @@ edu.illinois.cs.cogcomp illinois-ner - 4.0.12 + 4.0.13 test @@ -86,7 +86,7 @@ edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.12 + 4.0.13 diff --git a/tokenizer/pom.xml b/tokenizer/pom.xml index 683f9f469..c7a664c5c 100644 --- a/tokenizer/pom.xml +++ b/tokenizer/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 illinois-tokenizer @@ -15,17 +15,17 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp LBJava-NLP-tools - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-curator - 4.0.12 + 4.0.13 test diff --git a/transliteration/pom.xml b/transliteration/pom.xml index 04fb63691..2512edec3 100644 --- a/transliteration/pom.xml +++ b/transliteration/pom.xml @@ -5,7 +5,7 @@ http://www.w3.org/2001/XMLSchema-instance "> illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 4.0.0 @@ -20,12 +20,12 @@ http://www.w3.org/2001/XMLSchema-instance "> edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-edison - 4.0.12 + 4.0.13 org.apache.commons diff --git a/verbsense/pom.xml b/verbsense/pom.xml index 80b1285b2..18a55d7e3 100755 --- a/verbsense/pom.xml +++ b/verbsense/pom.xml @@ -3,7 +3,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.12 + 4.0.13 4.0.0 illinois-verbsense @@ -18,37 +18,37 @@ edu.illinois.cs.cogcomp illinois-edison - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-pos - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-ner - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-chunker - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-lemmatizer - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp illinois-inference - 4.0.12 + 4.0.13 edu.illinois.cs.cogcomp From 653e4ff3c5342d39ac6aa22a36be1149e70b45c2 Mon Sep 17 00:00:00 2001 From: "Thomas L. Redman" Date: Wed, 5 Sep 2018 12:48:43 -0500 Subject: [PATCH 3/3] Added a modicum of error handling to bits of code around what I touched. 
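
The convention applied below: resource-loading code now declares the checked exceptions it can raise (InvalidPortException, InvalidEndpointException, IOException, JWNLException, DatastoreException) instead of catching everything and printing a stack trace, while constructors whose signatures cannot change wrap any failure in a RuntimeException that carries the cause. A minimal sketch of the two cases, with illustrative names rather than code from this patch:

    import java.io.IOException;
    import java.util.Arrays;
    import java.util.List;

    class ReaderSketch {
        final List<String> tokens;

        // Case 1: a loader declares the checked exceptions it can raise,
        // leaving the decision about recovery to the caller.
        static List<String> loadTokens(String path) throws IOException {
            if (path == null)
                throw new IOException("no resource path configured");
            return Arrays.asList(path.split("/"));
        }

        // Case 2: a constructor that cannot widen its signature rethrows
        // the failure as an unchecked exception instead of swallowing it.
        ReaderSketch(String path) {
            try {
                tokens = loadTokens(path);
            } catch (Throwable t) {
                throw new RuntimeException("Tokens could not be produced from " + path, t);
            }
        }
    }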
--- .../org/cogcomp/md/BIOCombinedReader.java | 57 +++++++++-------- .../main/java/org/cogcomp/md/BIOReader.java | 62 +++++++++++-------- .../main/java/org/cogcomp/md/BIOTester.java | 42 ++++++++++--- .../java/org/cogcomp/md/ExtentReader.java | 25 ++++++-- .../java/org/cogcomp/md/ExtentTester.java | 18 ++++-- 5 files changed, 133 insertions(+), 71 deletions(-) diff --git a/md/src/main/java/org/cogcomp/md/BIOCombinedReader.java b/md/src/main/java/org/cogcomp/md/BIOCombinedReader.java index 7dfed869c..ba20ba8b3 100644 --- a/md/src/main/java/org/cogcomp/md/BIOCombinedReader.java +++ b/md/src/main/java/org/cogcomp/md/BIOCombinedReader.java @@ -20,7 +20,12 @@ import edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREDocumentReader; import edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader; import edu.illinois.cs.cogcomp.pos.POSAnnotator; +import io.minio.errors.InvalidEndpointException; +import io.minio.errors.InvalidPortException; +import net.didion.jwnl.JWNLException; + import org.cogcomp.Datastore; +import org.cogcomp.DatastoreException; import java.io.*; import java.util.*; @@ -46,8 +51,13 @@ public class BIOCombinedReader extends BIOReader { * @param mode Indicates the corpus and train/eval e.g. "ERE-TRAIN" * mode "ALL-TRAIN/EVAL" indicates hybrid corpus. * @param type Indicates the type (NAM/NOM/PRO/ALL) kept + * @throws DatastoreException + * @throws JWNLException + * @throws IOException + * @throws InvalidEndpointException + * @throws InvalidPortException */ - public BIOCombinedReader(int fold, String mode, String type){ + public BIOCombinedReader(int fold, String mode, String type) throws InvalidPortException, InvalidEndpointException, IOException, JWNLException, DatastoreException{ _mode = mode; _type = type; _taOnly = false; @@ -58,7 +68,7 @@ public BIOCombinedReader(int fold, String mode, String type){ id = "Hybrid_" + fold; } - public BIOCombinedReader(int fold, String mode, String type, Boolean taOnly){ + public BIOCombinedReader(int fold, String mode, String type, Boolean taOnly) throws InvalidPortException, InvalidEndpointException, IOException, JWNLException, DatastoreException{ _mode = mode; _type = type; _taOnly = taOnly; @@ -135,34 +145,29 @@ else if (mode.contains("ALL")){ } return ret; } - private List getTokensFromTAs(){ + private List getTokensFromTAs() throws InvalidPortException, InvalidEndpointException, IOException, JWNLException, DatastoreException { List ret = new ArrayList<>(); WordNetManager wordNet = null; Gazetteers gazetteers = null; BrownClusters brownClusters = null; - try { - Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig()); - File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false); - gazetteers = GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English); - Vector bcs = new Vector<>(); - bcs.add("brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt"); - bcs.add("brown-clusters/brownBllipClusters"); - bcs.add("brown-clusters/brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt"); - Vector bcst = new Vector<>(); - bcst.add(5); - bcst.add(5); - bcst.add(5); - Vector bcsl = new Vector<>(); - bcsl.add(false); - bcsl.add(false); - bcsl.add(false); - brownClusters = BrownClusters.get(bcs, bcst, bcsl); - WordNetManager.loadConfigAsClasspathResource(true); - wordNet = WordNetManager.getInstance(); - } - catch (Exception e){ - e.printStackTrace(); - } + Datastore ds = new Datastore(new 
ResourceConfigurator().getDefaultConfig()); + File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false); + gazetteers = GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English); + Vector bcs = new Vector<>(); + bcs.add("brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt"); + bcs.add("brown-clusters/brownBllipClusters"); + bcs.add("brown-clusters/brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt"); + Vector bcst = new Vector<>(); + bcst.add(5); + bcst.add(5); + bcst.add(5); + Vector bcsl = new Vector<>(); + bcsl.add(false); + bcsl.add(false); + bcsl.add(false); + brownClusters = BrownClusters.get(bcs, bcst, bcsl); + WordNetManager.loadConfigAsClasspathResource(true); + wordNet = WordNetManager.getInstance(); for (TextAnnotation ta : currentTas){ View tokenView = ta.getView(ViewNames.TOKENS); String mentionViewName = ""; diff --git a/md/src/main/java/org/cogcomp/md/BIOReader.java b/md/src/main/java/org/cogcomp/md/BIOReader.java index d6a88273b..cd6afac91 100644 --- a/md/src/main/java/org/cogcomp/md/BIOReader.java +++ b/md/src/main/java/org/cogcomp/md/BIOReader.java @@ -22,9 +22,15 @@ import edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREDocumentReader; import edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader; import edu.illinois.cs.cogcomp.pos.POSAnnotator; +import io.minio.errors.InvalidEndpointException; +import io.minio.errors.InvalidPortException; +import net.didion.jwnl.JWNLException; + import org.cogcomp.Datastore; +import org.cogcomp.DatastoreException; import java.io.File; +import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Vector; @@ -71,8 +77,13 @@ public BIOReader(){ * "PRO" -> Only pronouns * "ALL" -> All mentions * @param isBIO Indicates if the tagging schema is "BIO" or "BIOLU" + * @throws JWNLException + * @throws IOException + * @throws DatastoreException + * @throws InvalidEndpointException + * @throws InvalidPortException */ - public BIOReader(String path, String mode, String type, Boolean isBIO){ + public BIOReader(String path, String mode, String type, Boolean isBIO) { _path = path; _mode = mode.split("-")[0]; _binary_indicator = mode.split("-")[1]; @@ -83,7 +94,11 @@ public BIOReader(String path, String mode, String type, Boolean isBIO){ id = group + "_" + type; taList = getTextAnnotations(); annotateTas(); - tokenList = getTokensFromTAs(); + try { + tokenList = getTokensFromTAs(); + } catch (Throwable t) { + throw new RuntimeException("Tokens could not be reproduced from the text annotations.",t); + } } public List getTextAnnotations(){ @@ -137,34 +152,29 @@ private void annotateTas(){ } } - private List getTokensFromTAs(){ + private List getTokensFromTAs() throws InvalidPortException, InvalidEndpointException, DatastoreException, IOException, JWNLException{ List ret = new ArrayList<>(); WordNetManager wordNet = null; Gazetteers gazetteers = null; BrownClusters brownClusters = null; - try { - Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig()); - File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false); - gazetteers = GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English); - Vector bcs = new Vector<>(); - bcs.add("brown-clusters" + File.separator + "brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt"); - bcs.add("brown-clusters" + File.separator + 
"brownBllipClusters"); - bcs.add("brown-clusters" + File.separator + "brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt"); - Vector bcst = new Vector<>(); - bcst.add(5); - bcst.add(5); - bcst.add(5); - Vector bcsl = new Vector<>(); - bcsl.add(false); - bcsl.add(false); - bcsl.add(false); - brownClusters = BrownClusters.get(bcs, bcst, bcsl); - WordNetManager.loadConfigAsClasspathResource(true); - wordNet = WordNetManager.getInstance(); - } - catch (Exception e){ - e.printStackTrace(); - } + Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig()); + File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false); + gazetteers = GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English); + Vector bcs = new Vector<>(); + bcs.add("brown-clusters" + File.separator + "brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt"); + bcs.add("brown-clusters" + File.separator + "brownBllipClusters"); + bcs.add("brown-clusters" + File.separator + "brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt"); + Vector bcst = new Vector<>(); + bcst.add(5); + bcst.add(5); + bcst.add(5); + Vector bcsl = new Vector<>(); + bcsl.add(false); + bcsl.add(false); + bcsl.add(false); + brownClusters = BrownClusters.get(bcs, bcst, bcsl); + WordNetManager.loadConfigAsClasspathResource(true); + wordNet = WordNetManager.getInstance(); String mentionViewName = ""; if (_mode.equals("ACE05")){ diff --git a/md/src/main/java/org/cogcomp/md/BIOTester.java b/md/src/main/java/org/cogcomp/md/BIOTester.java index f68f0aa0a..2ef20b837 100644 --- a/md/src/main/java/org/cogcomp/md/BIOTester.java +++ b/md/src/main/java/org/cogcomp/md/BIOTester.java @@ -24,11 +24,17 @@ import edu.illinois.cs.cogcomp.nlp.corpusreaders.ACEReaderWithTrueCaseFixer; import edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREDocumentReader; import edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader; +import io.minio.errors.InvalidEndpointException; +import io.minio.errors.InvalidPortException; +import net.didion.jwnl.JWNLException; + +import org.cogcomp.DatastoreException; import org.cogcomp.md.LbjGen.bio_classifier_nam; import org.cogcomp.md.LbjGen.bio_classifier_nom; import org.cogcomp.md.LbjGen.bio_classifier_pro; import org.cogcomp.md.LbjGen.bio_label; +import java.io.IOException; import java.lang.reflect.Method; import java.util.ArrayList; import java.util.HashMap; @@ -364,8 +370,13 @@ public static Constituent getConstituent(Constituent curToken, Classifier classi /** * Cross Validation tester + * @throws DatastoreException + * @throws JWNLException + * @throws IOException + * @throws InvalidEndpointException + * @throws InvalidPortException */ - public static void test_cv(){ + public static void test_cv() throws InvalidPortException, InvalidEndpointException, IOException, JWNLException, DatastoreException{ boolean isBIO = false; int total_labeled_mention = 0; int total_predicted_mention = 0; @@ -468,8 +479,13 @@ public static void test_cv(){ /** * Test set tester + * @throws JWNLException + * @throws IOException + * @throws DatastoreException + * @throws InvalidEndpointException + * @throws InvalidPortException */ - public static void test_ts(){ + public static void test_ts() throws InvalidPortException, InvalidEndpointException, DatastoreException, IOException, JWNLException{ boolean isBIO = false; int total_labeled_mention = 0; int total_predicted_mention = 0; @@ -582,8 +598,13 @@ public static void test_ts(){ /** * 
ERE corpus tester + * @throws JWNLException + * @throws IOException + * @throws DatastoreException + * @throws InvalidEndpointException + * @throws InvalidPortException */ - public static void test_ere(){ + public static void test_ere() throws InvalidPortException, InvalidEndpointException, DatastoreException, IOException, JWNLException{ int total_labeled_mention = 0; int total_predicted_mention = 0; int total_correct_mention = 0; @@ -655,7 +676,7 @@ public static void test_ere(){ System.out.println("F1: " + f); } - public static void test_tac(){ + public static void test_tac() throws InvalidPortException, InvalidEndpointException, DatastoreException, IOException, JWNLException{ int total_labeled_mention = 0; int total_predicted_mention = 0; int total_correct_mention = 0; @@ -744,8 +765,13 @@ public static void calculateAvgMentionLength(){ /** * Test the model trained on hybrid ACE/ERE and evaluated on hybrid ACE/ERE * Produce results on separate types + * @throws DatastoreException + * @throws JWNLException + * @throws IOException + * @throws InvalidEndpointException + * @throws InvalidPortException */ - public static void test_hybrid(){ + public static void test_hybrid() throws InvalidPortException, InvalidEndpointException, IOException, JWNLException, DatastoreException{ int total_labeled_mention = 0; int total_predicted_mention = 0; int total_correct_mention = 0; @@ -951,7 +977,7 @@ public static void statistics(){ System.out.println("TAC_NOM: " + tac_nom); } - public static void TrainModel(String corpus){ + public static void TrainModel(String corpus) throws InvalidPortException, InvalidEndpointException, DatastoreException, IOException, JWNLException{ if (corpus.equals("ACE")) { Parser train_parser_nam = new BIOReader(getPath("all", "ACE", 0), "ACE05-TRAIN", "NAM", false); Parser train_parser_nom = new BIOReader(getPath("all", "ACE", 0), "ACE05-TRAIN", "NOM", false); @@ -970,11 +996,11 @@ else if (corpus.equals("ERE")){ } } - public static void TrainACEModel(){ + public static void TrainACEModel() throws InvalidPortException, InvalidEndpointException, DatastoreException, IOException, JWNLException{ TrainModel("ACE"); } - public static void TrainEREModel(){ + public static void TrainEREModel() throws InvalidPortException, InvalidEndpointException, DatastoreException, IOException, JWNLException{ TrainModel("ERE"); } diff --git a/md/src/main/java/org/cogcomp/md/ExtentReader.java b/md/src/main/java/org/cogcomp/md/ExtentReader.java index 2372d9647..62bc316b9 100644 --- a/md/src/main/java/org/cogcomp/md/ExtentReader.java +++ b/md/src/main/java/org/cogcomp/md/ExtentReader.java @@ -21,9 +21,15 @@ import edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREDocumentReader; import edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader; import edu.illinois.cs.cogcomp.pos.POSAnnotator; +import io.minio.errors.InvalidEndpointException; +import io.minio.errors.InvalidPortException; +import net.didion.jwnl.JWNLException; + import org.cogcomp.Datastore; +import org.cogcomp.DatastoreException; import java.io.File; +import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Vector; @@ -46,8 +52,13 @@ public class ExtentReader implements Parser * * @param path The data path * @param corpus The corpus "ACE/ERE" + * @throws DatastoreException + * @throws JWNLException + * @throws IOException + * @throws InvalidEndpointException + * @throws InvalidPortException */ - public ExtentReader(String path, String corpus){ + public ExtentReader(String 
path, String corpus) throws InvalidPortException, InvalidEndpointException, IOException, JWNLException, DatastoreException{ _path = path; _corpus = corpus; taList = getTextAnnotations(); @@ -57,11 +68,15 @@ public ExtentReader(String path, String corpus){ /** * When no corpus is selected, it is set to "ACE" */ - public ExtentReader(String path){ + public ExtentReader(String path) { _path = path; _corpus = "ACE"; - taList = getTextAnnotations(); - pairList = getPairs(); + try { + taList = getTextAnnotations(); + pairList = getPairs(); + } catch (Throwable t) { + throw new RuntimeException("TextAnnotation generation failed",t); + } } /** @@ -74,7 +89,7 @@ public String getId(){ return ret; } - public List getTextAnnotations(){ + public List getTextAnnotations() throws InvalidPortException, InvalidEndpointException, IOException, JWNLException, DatastoreException{ List ret = new ArrayList<>(); if (_corpus.equals("ACE")) { ACEReaderWithTrueCaseFixer aceReader = null; diff --git a/md/src/main/java/org/cogcomp/md/ExtentTester.java b/md/src/main/java/org/cogcomp/md/ExtentTester.java index 9d93fef76..74fbf2a6c 100644 --- a/md/src/main/java/org/cogcomp/md/ExtentTester.java +++ b/md/src/main/java/org/cogcomp/md/ExtentTester.java @@ -28,9 +28,15 @@ import edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.GazetteersFactory; import edu.illinois.cs.cogcomp.nlp.corpusreaders.ACEReader; import edu.illinois.cs.cogcomp.pos.POSAnnotator; +import io.minio.errors.InvalidEndpointException; +import io.minio.errors.InvalidPortException; +import net.didion.jwnl.JWNLException; + import org.cogcomp.Datastore; +import org.cogcomp.DatastoreException; import java.io.File; +import java.io.IOException; import java.lang.reflect.Method; import java.util.ArrayList; import java.util.List; @@ -65,7 +71,7 @@ public static extent_classifier train_extent_classifier(ExtentReader train_parse return train_extent_classifier(train_parser, null); } - public static void testSimpleExtent(){ + public static void testSimpleExtent() throws InvalidPortException, InvalidEndpointException, IOException, JWNLException, DatastoreException{ int true_labeled = 0; int true_predicted = 0; int true_correct = 0; @@ -204,7 +210,7 @@ public static Constituent getFullMention(extent_classifier classifier, Constitue return fullMention; } - public static void testExtentOnGoldHead(){ + public static void testExtentOnGoldHead() throws InvalidPortException, InvalidEndpointException, IOException, JWNLException, DatastoreException{ int labeled = 0; int correct = 0; POSAnnotator posAnnotator = null; @@ -287,7 +293,7 @@ public static Constituent getPredictedMentionHead(Constituent c){ Integer.parseInt(c.getAttribute("EntityHeadEndSpan"))); } - public static void testExtentOnPredictedHead(){ + public static void testExtentOnPredictedHead() throws InvalidPortException, InvalidEndpointException, DatastoreException, IOException, JWNLException{ WordNetManager wordNet = null; Gazetteers gazetteers = null; BrownClusters brownClusters = null; @@ -399,7 +405,7 @@ public static void testExtentOnPredictedHead(){ System.out.println("Total extent correct: " + total_mention_extent_correct); } - public static void TrainModel(String corpus){ + public static void TrainModel(String corpus) throws InvalidPortException, InvalidEndpointException, IOException, JWNLException, DatastoreException{ if (corpus.equals("ACE")){ ExtentReader e_train_parser = new ExtentReader("data/all", "ACE"); train_extent_classifier(e_train_parser, "models/EXTENT_ACE_TYPE"); @@ -410,11 +416,11 @@ public static 
void TrainModel(String corpus){ } } - public static void TrainACEModel(){ + public static void TrainACEModel() throws InvalidPortException, InvalidEndpointException, IOException, JWNLException, DatastoreException{ TrainModel("ACE"); } - public static void TrainEREModel() { + public static void TrainEREModel() throws InvalidPortException, InvalidEndpointException, IOException, JWNLException, DatastoreException { TrainModel("ERE"); }
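
Taken together, the series leaves callers with a single initialization pattern for the shared NER resources. A minimal sketch of that pattern follows, assuming the resource directory layout used above; the wrapper class is illustrative, and the generic parameters on the Vector arguments are inferred from the add() calls in the diffs:

    import java.io.File;
    import java.util.Vector;

    import edu.illinois.cs.cogcomp.core.constants.Language;
    import edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.BrownClusters;
    import edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.Gazetteers;
    import edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.GazetteersFactory;

    class ResourceSetupSketch {
        static void init(String gazetteersDirectory) throws Exception {
            // A single factory call now builds and returns the gazetteers for a
            // given language; there is no separate init()-then-get() sequence.
            Gazetteers gazetteers = GazetteersFactory.get(5,
                    gazetteersDirectory + File.separator + "gazetteers", true, Language.English);

            // BrownClusters.get(...) likewise returns the instance directly;
            // callers keep the reference instead of re-fetching it from a static.
            Vector<String> paths = new Vector<>();
            paths.add("brown-clusters" + File.separator + "brownBllipClusters");
            Vector<Integer> thresholds = new Vector<>();
            thresholds.add(5);
            Vector<Boolean> lowercaseFlags = new Vector<>();
            lowercaseFlags.add(false);
            BrownClusters brownClusters = BrownClusters.get(paths, thresholds, lowercaseFlags);
        }
    }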