Merge pull request #424 from lingss0918/master

similarity package
CogComp · Jul 14, 2017 · 8193e05 · 8193e05
2 parents 3ec8168 + 61e2dd4
commit 8193e05
Show file tree

Hide file tree

Showing 86 changed files with 205,899 additions and 53 deletions.
diff --git a/similarity/README.md b/similarity/README.md
@@ -2,7 +2,59 @@
 
 This module specifies a simple API for NLP components that compare
 objects -- especially Strings -- and return a score indicating how
-similar they are.  It is used in our WordNet-, Named Entity-, embedding-, 
-and paraphrase-based similarity code to simplify integration of 
+similar they are.  It is used in our WordNet-, Named Entity-, embedding-,
+and paraphrase-based similarity code to simplify integration of
 different similarity resources.
-
+
+## Download Resource File
+
+The first time you use a specific metric, the system automatically downloads the corresponding resource file from the CogComp server to the `user.home` directory on your local machine.
+Notice: Some resource files are very large and may take a while to download. The "paragram" resource is already included in `src/main/resources/`.
+
+## Configure File
+
+The default configuration file is `config/configurations.properties`. The built-in default values are defined in `SimConfigurator` in the `edu.illinois.cs.cogcomp.config` package.
+
+`wordMetric` is the word comparison metric. It can be chosen from "word2vec", "paragram", "esa", "glove", "wordnet", "phrase2vec" or "customized" (your own embedding file). Notice: This metric will also be used as the word comparator in LLM.
+
+The `usePhraseSim` option automatically tokenizes the sentence into phrase-based units when comparing sentences. Note that it should be used together with "phrase2vec".
+
+The `useNER` option runs NER on each sentence and compares named entities using the NE comparison metrics in LLM.
+
+`customized` gives you the option to use your own embedding file. Put the location of the file in this field and the dimension of the embedding in the field `customized_embedding_dim`.
+
+## Word similarity
+To use word comparison metric:
+```java
+//initialization
+ResourceManager rm_ = new SimConfigurator().getConfig(new ResourceManager(file));
+WordSim ws = new WordSim(rm_, metric);
+//ws.compare(word1,word2,metric);
+double score=ws.compare("word", "sentence", metric);
+```
+And the metric can be chosen from "word2vec", "paragram", "esa", "glove", "wordnet" or "phrase2vec" (provided you have downloaded the relevant data resource -- see above).
+
+
+## Named Entity Comparison
+To use the named entity comparison metric:
+
+```java
+NESim nesim=new NESim();
+double score=nesim.compare("Donald Trump", "Trump");
+```
+
+## Lexical Level Matching
+To use lexical level matching comparison:
+```java
+String config = "config/configurations.properties";
+Metric llm =new LLMStringSim(config);
+String s1="please turn off the light";
+String s2="please turn on the monitor";
+double score=llm.compare(s1,s2);
+```
+
+To get the basic LLM similarity score, just set `usePhraseSim` and `useNER` to false in the config file (the default setting).
+
+To use the `usePhraseSim` option, set it to true and use `phrase2vec` as `wordMetric`. The system will tokenize the sentence into phrase-based units and reformat the sentence. E.g. "please turn the light on" => "please turn-on the light".
+
+To use the `useNER` option, set it to true. The system will run NER on the sentences first and then compare named entities and words separately. Notice: the NER initialization takes a lot of memory.
diff --git a/similarity/config/configurations.properties b/similarity/config/configurations.properties
@@ -0,0 +1,17 @@
+wordMetric	paragram
+usePhraseSim	false
+useNER	false
+debug	false
+useStopwords	true
+useLemmas	true
+useMetric	true
+stopwordFile	llmStopwords.txt
+useSimpleScore	false
+wordEntailmentThreshold	0.00001
+llmThreshold	0.5
+paragram_dim	25
+embedding_dim	200
+paragram	src/main/resources/paragram_vectors.txt
+phrase_dict	src/main/resources/phrases.txt
+customized	src/main/resources/paragram_vectors.txt
+customized_embedding_dim	25
diff --git a/similarity/config/log4j.properties b/similarity/config/log4j.properties
@@ -0,0 +1,12 @@
+# Root logger option
+log4j.rootLogger=INFO, file
+
+# Direct log messages to a log file
+log4j.appender.file=org.apache.log4j.RollingFileAppender
+
+
+log4j.appender.file.File=llm.log
+log4j.appender.file.MaxFileSize=10MB
+log4j.appender.file.MaxBackupIndex=10
+log4j.appender.file.layout=org.apache.log4j.PatternLayout
+log4j.appender.file.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
diff --git a/similarity/config/test.configurations.properties b/similarity/config/test.configurations.properties
@@ -0,0 +1,3 @@
+memorybasedESA	src/test/resources/MemoryBasedESA_test.txt
+pageIDMapping	src/test/resources/wikiPageIDMapping_test.txt
+phrase_dict	src/main/resources/phrases.txt
diff --git a/similarity/pom.xml b/similarity/pom.xml
@@ -1,28 +1,66 @@
 <project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://maven.apache.org/POM/4.0.0" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-    <parent>
-        <artifactId>illinois-cogcomp-nlp</artifactId>
-        <groupId>edu.illinois.cs.cogcomp</groupId>
-        <version>3.1.22</version>
-    </parent>
+  <parent>
+    <artifactId>illinois-cogcomp-nlp</artifactId>
+    <groupId>edu.illinois.cs.cogcomp</groupId>
+    <version>3.1.22</version>
+  </parent>
 
-    <modelVersion>4.0.0</modelVersion>
+  <modelVersion>4.0.0</modelVersion>
 
-    <artifactId>illinois-similarity</artifactId>
-
-
-    <dependencies>
-        <dependency>
-            <groupId>edu.illinois.cs.cogcomp</groupId>
-            <artifactId>illinois-core-utilities</artifactId>
-            <version>3.1.22</version>
-        </dependency>
-        <dependency>
-            <groupId>org.slf4j</groupId>
-            <artifactId>slf4j-log4j12</artifactId>
-            <version>1.7.12</version>
-            <optional>true</optional>
-        </dependency>
-    </dependencies>
+  <artifactId>illinois-similarity</artifactId>
 
+	<dependencies>
+		<dependency>
+			<groupId>edu.illinois.cs.cogcomp</groupId>
+			<artifactId>illinois-core-utilities</artifactId>
+			<version>3.1.21</version>
+		</dependency>
+		<dependency>
+			<groupId>org.cogcomp</groupId>
+			<artifactId>cogcomp-datastore</artifactId>
+			<version>1.9.7</version>
+		</dependency>
+		<dependency>
+			<groupId>edu.illinois.cs.cogcomp</groupId>
+			<artifactId>illinois-ner</artifactId>
+			<version>3.1.21</version>
+		</dependency>
+		<dependency>
+			<groupId>org.slf4j</groupId>
+			<artifactId>slf4j-log4j12</artifactId>
+			<version>1.7.12</version>
+			<optional>true</optional>
+		</dependency>
+		<dependency>
+			<groupId>org.mapdb</groupId>
+			<artifactId>mapdb</artifactId>
+			<version>3.0.3</version>
+		</dependency>
+		<dependency>
+			<groupId>edu.illinois.cs.cogcomp</groupId>
+			<artifactId>illinois-wnsim</artifactId>
+			<version>2.2.1</version>
+		</dependency>
+		<dependency>
+			<groupId>edu.illinois.cs.cogcomp</groupId>
+			<artifactId>illinois-phrasesim</artifactId>
+			<version>1.1</version>
+		</dependency>
+		<dependency>
+			<groupId>edu.illinois.cs.cogcomp</groupId>
+			<artifactId>illinois-tokenizer</artifactId>
+			<version>3.1.21</version>
+		</dependency>
+		<dependency>
+			<groupId>edu.illinois.cs.cogcomp</groupId>
+			<artifactId>DatalessClassification</artifactId>
+			<version>0.0.1</version>
+		</dependency>
+		<dependency>
+			<groupId>com.wcohen</groupId>
+			<artifactId>SecondString</artifactId>
+			<version>1.0</version>
+		</dependency>
+	</dependencies>
 
 </project>
diff --git a/similarity/src/main/java/edu/illinois/cs/cogcomp/config/EmbeddingConstant.java b/similarity/src/main/java/edu/illinois/cs/cogcomp/config/EmbeddingConstant.java
@@ -0,0 +1,20 @@
+/**
+ * This software is released under the University of Illinois/Research and Academic Use License. See
+ * the LICENSE file in the root folder for details. Copyright (c) 2016
+ *
+ * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign
+ * http://cogcomp.cs.illinois.edu/
+ */
+package edu.illinois.cs.cogcomp.config;
+
+/**
+ * String identifiers for the embedding resources and word-comparison metrics
+ * used by the similarity module.
+ *
+ * Each constant's value is identical to its own name. Most of them coincide
+ * with property keys declared in {@code SimConfigurator} ("word2vec",
+ * "paragram", "glove", "phrase2vec", "memorybasedESA", "pageIDMapping",
+ * "customized"); "wordnet" is the default value of the "wordMetric" property
+ * there.
+ *
+ * The fields are {@code final} so that no caller can accidentally reassign a
+ * shared identifier at runtime.
+ */
+public class EmbeddingConstant {
+	/** Identifier "word2vec". */
+	public static final String word2vec = "word2vec";
+	/** Identifier "paragram". */
+	public static final String paragram = "paragram";
+	/** Identifier "memorybasedESA". */
+	public static final String memorybasedESA = "memorybasedESA";
+	/** Identifier "pageIDMapping". */
+	public static final String pageIDMapping = "pageIDMapping";
+	/** Identifier "glove". */
+	public static final String glove = "glove";
+	/** Identifier "wordnet". */
+	public static final String wordnet = "wordnet";
+	/** Identifier "phrase2vec". */
+	public static final String phrase2vec = "phrase2vec";
+	/** Identifier "customized". */
+	public static final String customized = "customized";
+
+}
diff --git a/similarity/src/main/java/edu/illinois/cs/cogcomp/config/NESimConfigurator.java b/similarity/src/main/java/edu/illinois/cs/cogcomp/config/NESimConfigurator.java
@@ -0,0 +1,36 @@
+/**
+ * This software is released under the University of Illinois/Research and Academic Use License. See
+ * the LICENSE file in the root folder for details. Copyright (c) 2016
+ *
+ * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign
+ * http://cogcomp.cs.illinois.edu/
+ */
+package edu.illinois.cs.cogcomp.config;
+
+import edu.illinois.cs.cogcomp.core.utilities.configuration.*;
+
+/**
+ * Abstract configurator that declares the properties, with their default
+ * values, used for named-entity similarity, and exposes them as a default
+ * {@link ResourceManager}.
+ */
+public abstract class NESimConfigurator extends Configurator {
+
+	/** Property "acronymFile"; defaults to "acronyms.txt". */
+	public static final Property ACRONYM_FILE = new Property("acronymFile", "acronyms.txt");
+
+	/** Property "countryLanguageFile"; defaults to "countrylanguage.txt". */
+	public static final Property CL_FILE = new Property("countryLanguageFile", "countrylanguage.txt");
+
+	/** Property "honorificsFile"; defaults to "honorifics.txt". */
+	public static final Property HONORIFICS_FILE = new Property("honorificsFile", "honorifics.txt");
+
+	/** Property "locationFile"; defaults to "locations.txt". */
+	public static final Property LOCATION_FILE = new Property("locationFile", "locations.txt");
+
+	/** Property "nicknameFile"; defaults to "nicknames.txt". */
+	public static final Property NICKNAME_FILE = new Property("nicknameFile", "nicknames.txt");
+
+	/** Property "peopleFile"; defaults to "people.txt". */
+	public static final Property PEOPLE_FILE = new Property("peopleFile", "people.txt");
+
+	/** Property "shortcutFile"; defaults to "shortcuts.txt". */
+	public static final Property SHORTCUT_FILE = new Property("shortcutFile", "shortcuts.txt");
+
+	/** Property "similarityThreshold"; defaults to "0.5". */
+	public static final Property SIMILARITY_THRESHOLD = new Property("similarityThreshold", "0.5");
+
+	/**
+	 * Builds a ResourceManager holding the default key/value pair of every
+	 * property declared by this configurator.
+	 *
+	 * @return a ResourceManager created from the default properties.
+	 */
+	@Override
+	public ResourceManager getDefaultConfig() {
+		return new ResourceManager(generateProperties(new Property[] {
+				ACRONYM_FILE,
+				CL_FILE,
+				HONORIFICS_FILE,
+				LOCATION_FILE,
+				NICKNAME_FILE,
+				PEOPLE_FILE,
+				SHORTCUT_FILE,
+				SIMILARITY_THRESHOLD }));
+	}
+
+}
diff --git a/similarity/src/main/java/edu/illinois/cs/cogcomp/config/SimConfigurator.java b/similarity/src/main/java/edu/illinois/cs/cogcomp/config/SimConfigurator.java
@@ -0,0 +1,49 @@
+/**
+ * This software is released under the University of Illinois/Research and Academic Use License. See
+ * the LICENSE file in the root folder for details. Copyright (c) 2016
+ *
+ * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign
+ * http://cogcomp.cs.illinois.edu/
+ */
+package edu.illinois.cs.cogcomp.config;
+
+import edu.illinois.cs.cogcomp.core.utilities.configuration.Configurator;
+import edu.illinois.cs.cogcomp.core.utilities.configuration.Property;
+import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager;
+
+/**
+ * Configurator for the similarity module: declares every supported property
+ * together with its default value and builds ResourceManagers from them.
+ */
+public class SimConfigurator extends Configurator {
+	/** Property "phrase_dict"; defaults to the bundled phrases file path. */
+	public static final Property PHRASE_DICT = new Property("phrase_dict", "src/main/resources/phrases.txt");
+	/** Property "useNER"; off by default. */
+	public static final Property USE_NE_COMPARISON = new Property("useNER", Configurator.FALSE);
+	/** Property "usePhraseSim"; off by default. */
+	public static final Property USE_PHRASE_COMPARISON = new Property("usePhraseSim", Configurator.FALSE);
+	/** Property "useSimpleScore"; off by default. */
+	public static final Property USE_SIMPLE_SCORE = new Property("useSimpleScore", Configurator.FALSE);
+	/** Property "stopwordFile"; defaults to "llmStopwords.txt". */
+	public static final Property STOPWORD_FILE = new Property("stopwordFile", "llmStopwords.txt");
+	/** Property "wordMetric"; defaults to "wordnet". */
+	public static final Property WORD_METRIC = new Property("wordMetric", "wordnet");
+	/** Property "wordEntailmentThreshold"; defaults to "0.001". */
+	public static final Property WORD_ENTAILMENT_THRESHOLD = new Property("wordEntailmentThreshold", "0.001");
+	/** Property "llmThreshold"; defaults to "0.5". */
+	public static final Property LLM_ENTAILMENT_THRESHOLD = new Property("llmThreshold", "0.5");
+	/** Property "word2vec"; empty by default. */
+	public static final Property WORD2VEC = new Property("word2vec", "");
+	/** Property "paragram"; defaults to the bundled paragram vectors path. */
+	public static final Property PARAGRAM = new Property("paragram", "src/main/resources/paragram_vectors.txt");
+	/** Property "glove"; empty by default. */
+	public static final Property GLOVE = new Property("glove", "");
+	/** Property "phrase2vec"; empty by default. */
+	public static final Property PHRASE2VEC = new Property("phrase2vec", "");
+	/** Property "memorybasedESA"; empty by default. */
+	public static final Property MEMORYBASEDESA = new Property("memorybasedESA", "");
+	/** Property "paragram_dim"; defaults to "25". */
+	public static final Property PARAGRAM_DIM = new Property("paragram_dim", "25");
+	/** Property "pageIDMapping"; empty by default. */
+	public static final Property PAGE_ID_MAPPING = new Property("pageIDMapping", "");
+	/** Property "embedding_dim"; defaults to "200". */
+	public static final Property EMBEDDING_DIM = new Property("embedding_dim", "200");
+	/** Property "customized"; defaults to the bundled paragram vectors path. */
+	public static final Property CUSTOMIZED = new Property("customized", "src/main/resources/paragram_vectors.txt");
+	/** Property "customized_embedding_dim"; defaults to "25". */
+	public static final Property CUSTOMIZED_EMBEDDING_DIM = new Property("customized_embedding_dim", "25");
+
+	/**
+	 * Builds a ResourceManager holding the default value of every property
+	 * declared by this class.
+	 *
+	 * PHRASE_DICT, CUSTOMIZED and CUSTOMIZED_EMBEDDING_DIM are registered here
+	 * as well; they were previously declared with defaults but left out of the
+	 * list, so their defaults were never part of the returned configuration.
+	 *
+	 * @return a ResourceManager created from the default properties.
+	 */
+	@Override
+	public ResourceManager getDefaultConfig() {
+		Property[] props = { WORD2VEC, PARAGRAM, GLOVE, PHRASE2VEC, MEMORYBASEDESA, PARAGRAM_DIM, PAGE_ID_MAPPING,
+				EMBEDDING_DIM, CUSTOMIZED, CUSTOMIZED_EMBEDDING_DIM, PHRASE_DICT, USE_NE_COMPARISON,
+				USE_PHRASE_COMPARISON, USE_SIMPLE_SCORE, STOPWORD_FILE, WORD_METRIC, WORD_ENTAILMENT_THRESHOLD,
+				LLM_ENTAILMENT_THRESHOLD };
+		return new ResourceManager(generateProperties(props));
+	}
+
+	/**
+	 * Loads the configuration in {@code file} and merges it with a
+	 * single-entry configuration that sets "wordMetric" to {@code metrics}.
+	 *
+	 * NOTE(review): presumably the "wordMetric" value given here takes
+	 * precedence over the one in the file -- confirm against the precedence
+	 * rules of {@code Configurator.mergeProperties}.
+	 *
+	 * @param metrics
+	 *            value to use for the "wordMetric" property
+	 * @param file
+	 *            location of the configuration file to load
+	 * @return the merged ResourceManager
+	 * @throws Exception
+	 *             if the configuration file cannot be loaded
+	 */
+	public ResourceManager metricsConfig(String metrics, String file) throws Exception {
+		Property metric = new Property("wordMetric", metrics);
+		Property[] props = { metric };
+		ResourceManager metricOverride = new ResourceManager(generateProperties(props));
+		return super.mergeProperties(new ResourceManager(file), metricOverride);
+	}
+
+}
diff --git a/similarity/src/main/java/edu/illinois/cs/cogcomp/llm/align/WordListFilter.java b/similarity/src/main/java/edu/illinois/cs/cogcomp/llm/align/WordListFilter.java
@@ -0,0 +1,73 @@
+/**
+ * This software is released under the University of Illinois/Research and Academic Use License. See
+ * the LICENSE file in the root folder for details. Copyright (c) 2016
+ *
+ * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign
+ * http://cogcomp.cs.illinois.edu/
+ */
+package edu.illinois.cs.cogcomp.llm.align;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Set;
+
+import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager;
+import edu.illinois.cs.cogcomp.mrcs.align.ListFilter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import edu.illinois.cs.cogcomp.core.io.LineIO;
+
+/**
+ * A {@link ListFilter} over strings that blanks out stopwords. The stopword
+ * list is read from the classpath resource named by the "stopwordFile"
+ * configuration property; matching is case-insensitive.
+ */
+public class WordListFilter implements ListFilter<String> {
+	public static final java.lang.String STOPWORD_FILE = "stopwordFile";
+
+	private static final String NAME = WordListFilter.class.getCanonicalName();
+	private static final Logger logger = LoggerFactory.getLogger(WordListFilter.class);
+	private String m_stopwordFile;
+	private Set<String> m_stopwords;
+
+	/**
+	 * Reads the stopword file name from the configuration and loads the
+	 * stopword list.
+	 *
+	 * @param rm_
+	 *            configuration providing the "stopwordFile" property
+	 * @throws IOException
+	 *             if the stopword file cannot be read
+	 */
+	public WordListFilter(ResourceManager rm_) throws IOException {
+		m_stopwordFile = rm_.getString(STOPWORD_FILE);
+		loadStopwords();
+	}
+
+	/**
+	 * Filter non-content words from an input array of strings.
+	 *
+	 * The result has the same length as the input: position i holds the
+	 * original element when it is not a stopword, and null when it is a
+	 * stopword. A null input element is treated as filtered (stays null)
+	 * instead of raising a NullPointerException.
+	 *
+	 * @param elements_
+	 *            a list of input words
+	 * @return an array aligned with the input in which stopwords are replaced
+	 *         by null
+	 */
+
+	@Override
+	public String[] filter(String[] elements_) {
+		String[] filteredElts = new String[elements_.length];
+
+		for (int i = 0; i < elements_.length; ++i) {
+			String token = elements_[i];
+
+			// Lower-case with a fixed locale so matching does not depend on the
+			// platform default (e.g. Turkish dotless-i rules).
+			boolean keep = token != null && !m_stopwords.contains(token.toLowerCase(java.util.Locale.ROOT));
+			filteredElts[i] = keep ? token : null;
+
+			// Parameterized logging: the message is only built when debug is on.
+			logger.debug("{} element '{}'.", keep ? "DID NOT FILTER" : "FILTERED", token);
+		}
+
+		return filteredElts;
+	}
+
+	/**
+	 * Loads the stopword list from the classpath resource named by the
+	 * configured stopword file. Entries are trimmed and lower-cased
+	 * (locale-independently); blank lines are skipped.
+	 *
+	 * @throws IOException
+	 *             if the resource cannot be read
+	 */
+	protected void loadStopwords() throws IOException {
+
+		ArrayList<String> lines = LineIO.readFromClasspath(m_stopwordFile);
+		Set<String> stopwords = new HashSet<String>();
+
+		for (String line : lines) {
+			// Surrounding whitespace would make an entry impossible to match.
+			String word = line.trim();
+			if (!word.isEmpty())
+				stopwords.add(word.toLowerCase(java.util.Locale.ROOT));
+		}
+
+		m_stopwords = stopwords;
+	}
+
+}