-
Notifications
You must be signed in to change notification settings - Fork 142
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #424 from lingss0918/master
similarity package
- Loading branch information
Showing
86 changed files
with
205,899 additions
and
53 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
wordMetric paragram | ||
usePhraseSim false | ||
useNER false | ||
debug false | ||
useStopwords true | ||
useLemmas true | ||
useMetric true | ||
stopwordFile llmStopwords.txt | ||
useSimpleScore false | ||
wordEntailmentThreshold 0.00001 | ||
llmThreshold 0.5 | ||
paragram_dim 25 | ||
embedding_dim 200 | ||
paragram src/main/resources/paragram_vectors.txt | ||
phrase_dict src/main/resources/phrases.txt | ||
customized src/main/resources/paragram_vectors.txt | ||
customized_embedding_dim 25 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Root logger option | ||
log4j.rootLogger=INFO, file | ||
|
||
# Direct log messages to a log file | ||
log4j.appender.file=org.apache.log4j.RollingFileAppender | ||
|
||
|
||
log4j.appender.file.File=llm.log | ||
log4j.appender.file.MaxFileSize=10MB | ||
log4j.appender.file.MaxBackupIndex=10 | ||
log4j.appender.file.layout=org.apache.log4j.PatternLayout | ||
log4j.appender.file.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
memorybasedESA src/test/resources/MemoryBasedESA_test.txt | ||
pageIDMapping src/test/resources/wikiPageIDMapping_test.txt | ||
phrase_dict src/main/resources/phrases.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,28 +1,66 @@ | ||
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://maven.apache.org/POM/4.0.0" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
<parent> | ||
<artifactId>illinois-cogcomp-nlp</artifactId> | ||
<groupId>edu.illinois.cs.cogcomp</groupId> | ||
<version>3.1.22</version> | ||
</parent> | ||
<parent> | ||
<artifactId>illinois-cogcomp-nlp</artifactId> | ||
<groupId>edu.illinois.cs.cogcomp</groupId> | ||
<version>3.1.22</version> | ||
</parent> | ||
|
||
<modelVersion>4.0.0</modelVersion> | ||
<modelVersion>4.0.0</modelVersion> | ||
|
||
<artifactId>illinois-similarity</artifactId> | ||
|
||
|
||
<dependencies> | ||
<dependency> | ||
<groupId>edu.illinois.cs.cogcomp</groupId> | ||
<artifactId>illinois-core-utilities</artifactId> | ||
<version>3.1.22</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>org.slf4j</groupId> | ||
<artifactId>slf4j-log4j12</artifactId> | ||
<version>1.7.12</version> | ||
<optional>true</optional> | ||
</dependency> | ||
</dependencies> | ||
<artifactId>illinois-similarity</artifactId> | ||
|
||
<dependencies> | ||
<dependency> | ||
<groupId>edu.illinois.cs.cogcomp</groupId> | ||
<artifactId>illinois-core-utilities</artifactId> | ||
<version>3.1.21</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>org.cogcomp</groupId> | ||
<artifactId>cogcomp-datastore</artifactId> | ||
<version>1.9.7</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>edu.illinois.cs.cogcomp</groupId> | ||
<artifactId>illinois-ner</artifactId> | ||
<version>3.1.21</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>org.slf4j</groupId> | ||
<artifactId>slf4j-log4j12</artifactId> | ||
<version>1.7.12</version> | ||
<optional>true</optional> | ||
</dependency> | ||
<dependency> | ||
<groupId>org.mapdb</groupId> | ||
<artifactId>mapdb</artifactId> | ||
<version>3.0.3</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>edu.illinois.cs.cogcomp</groupId> | ||
<artifactId>illinois-wnsim</artifactId> | ||
<version>2.2.1</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>edu.illinois.cs.cogcomp</groupId> | ||
<artifactId>illinois-phrasesim</artifactId> | ||
<version>1.1</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>edu.illinois.cs.cogcomp</groupId> | ||
<artifactId>illinois-tokenizer</artifactId> | ||
<version>3.1.21</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>edu.illinois.cs.cogcomp</groupId> | ||
<artifactId>DatalessClassification</artifactId> | ||
<version>0.0.1</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>com.wcohen</groupId> | ||
<artifactId>SecondString</artifactId> | ||
<version>1.0</version> | ||
</dependency> | ||
</dependencies> | ||
|
||
</project> |
20 changes: 20 additions & 0 deletions
20
similarity/src/main/java/edu/illinois/cs/cogcomp/config/EmbeddingConstant.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
/** | ||
* This software is released under the University of Illinois/Research and Academic Use License. See | ||
* the LICENSE file in the root folder for details. Copyright (c) 2016 | ||
* | ||
* Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign | ||
* http://cogcomp.cs.illinois.edu/ | ||
*/ | ||
package edu.illinois.cs.cogcomp.config; | ||
|
||
/**
 * String identifiers for the word/phrase similarity resources used by this package.
 *
 * <p>
 * Each constant's value is identical to its field name. Several of the values coincide with
 * property keys declared in {@code SimConfigurator} (for example {@code "paragram"} and
 * {@code "customized"}).
 *
 * <p>
 * The fields are {@code final}: they are shared, globally visible names and must not be
 * reassigned at runtime.
 */
public class EmbeddingConstant {
    public static final String word2vec = "word2vec";
    public static final String paragram = "paragram";
    public static final String memorybasedESA = "memorybasedESA";
    public static final String pageIDMapping = "pageIDMapping";
    public static final String glove = "glove";
    public static final String wordnet = "wordnet";
    public static final String phrase2vec = "phrase2vec";
    public static final String customized = "customized";

}
36 changes: 36 additions & 0 deletions
36
similarity/src/main/java/edu/illinois/cs/cogcomp/config/NESimConfigurator.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
/** | ||
* This software is released under the University of Illinois/Research and Academic Use License. See | ||
* the LICENSE file in the root folder for details. Copyright (c) 2016 | ||
* | ||
* Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign | ||
* http://cogcomp.cs.illinois.edu/ | ||
*/ | ||
package edu.illinois.cs.cogcomp.config; | ||
|
||
import edu.illinois.cs.cogcomp.core.utilities.configuration.*; | ||
|
||
public abstract class NESimConfigurator extends Configurator { | ||
|
||
public static final Property ACRONYM_FILE = new Property("acronymFile", "acronyms.txt"); | ||
public static final Property CL_FILE = new Property("countryLanguageFile", "countrylanguage.txt"); | ||
public static final Property HONORIFICS_FILE = new Property("honorificsFile", "honorifics.txt"); | ||
public static final Property LOCATION_FILE = new Property("locationFile", "locations.txt"); | ||
public static final Property NICKNAME_FILE = new Property("nicknameFile", "nicknames.txt"); | ||
public static final Property PEOPLE_FILE = new Property("peopleFile", "people.txt"); | ||
public static final Property SHORTCUT_FILE = new Property("shortcutFile", "shortcuts.txt"); | ||
public static final Property SIMILARITY_THRESHOLD = new Property("similarityThreshold", "0.5"); | ||
|
||
/** | ||
* get a ResourceManager object with the default key/value pairs for this | ||
* configurator | ||
* | ||
* @return a non-null ResourceManager with appropriate values set. | ||
*/ | ||
@Override | ||
public ResourceManager getDefaultConfig() { | ||
Property[] props = { ACRONYM_FILE, CL_FILE, HONORIFICS_FILE, LOCATION_FILE, NICKNAME_FILE, PEOPLE_FILE, | ||
SHORTCUT_FILE, SIMILARITY_THRESHOLD }; | ||
return new ResourceManager(generateProperties(props)); | ||
} | ||
|
||
} |
49 changes: 49 additions & 0 deletions
49
similarity/src/main/java/edu/illinois/cs/cogcomp/config/SimConfigurator.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
/** | ||
* This software is released under the University of Illinois/Research and Academic Use License. See | ||
* the LICENSE file in the root folder for details. Copyright (c) 2016 | ||
* | ||
* Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign | ||
* http://cogcomp.cs.illinois.edu/ | ||
*/ | ||
package edu.illinois.cs.cogcomp.config; | ||
|
||
import edu.illinois.cs.cogcomp.core.utilities.configuration.Configurator; | ||
import edu.illinois.cs.cogcomp.core.utilities.configuration.Property; | ||
import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager; | ||
|
||
public class SimConfigurator extends Configurator { | ||
public static final Property PHRASE_DICT = new Property("phrase_dict", "src/main/resources/phrases.txt"); | ||
public static final Property USE_NE_COMPARISON = new Property("useNER", Configurator.FALSE); | ||
public static final Property USE_PHRASE_COMPARISON = new Property("usePhraseSim", Configurator.FALSE); | ||
public static final Property USE_SIMPLE_SCORE = new Property("useSimpleScore", Configurator.FALSE); | ||
public static final Property STOPWORD_FILE = new Property("stopwordFile", "llmStopwords.txt"); | ||
public static final Property WORD_METRIC = new Property("wordMetric", "wordnet"); | ||
public static final Property WORD_ENTAILMENT_THRESHOLD = new Property("wordEntailmentThreshold", "0.001"); | ||
public static final Property LLM_ENTAILMENT_THRESHOLD = new Property("llmThreshold", "0.5"); | ||
public static final Property WORD2VEC = new Property("word2vec", ""); | ||
public static final Property PARAGRAM = new Property("paragram", "src/main/resources/paragram_vectors.txt"); | ||
public static final Property GLOVE = new Property("glove", ""); | ||
public static final Property PHRASE2VEC = new Property("phrase2vec", ""); | ||
public static final Property MEMORYBASEDESA = new Property("memorybasedESA", ""); | ||
public static final Property PARAGRAM_DIM = new Property("paragram_dim", "25"); | ||
public static final Property PAGE_ID_MAPPING = new Property("pageIDMapping", ""); | ||
public static final Property EMBEDDING_DIM = new Property("embedding_dim", "200"); | ||
public static final Property CUSTOMIZED = new Property("customized", "src/main/resources/paragram_vectors.txt"); | ||
public static final Property CUSTOMIZED_EMBEDDING_DIM = new Property("customized_embedding_dim", "25"); | ||
|
||
@Override | ||
public ResourceManager getDefaultConfig() { | ||
Property[] props = { WORD2VEC, PARAGRAM, GLOVE, PHRASE2VEC, MEMORYBASEDESA, PARAGRAM_DIM, PAGE_ID_MAPPING, | ||
EMBEDDING_DIM, USE_NE_COMPARISON, USE_PHRASE_COMPARISON, USE_SIMPLE_SCORE, STOPWORD_FILE, WORD_METRIC, | ||
WORD_ENTAILMENT_THRESHOLD, LLM_ENTAILMENT_THRESHOLD }; | ||
return new ResourceManager(generateProperties(props)); | ||
} | ||
|
||
public ResourceManager metricsConfig(String metrics, String file) throws Exception { | ||
Property metric = new Property("wordMetric", metrics); | ||
Property[] props = { metric }; | ||
ResourceManager rm_ = new ResourceManager(generateProperties(props)); | ||
return super.mergeProperties(new ResourceManager(file), rm_); | ||
} | ||
|
||
} |
73 changes: 73 additions & 0 deletions
73
similarity/src/main/java/edu/illinois/cs/cogcomp/llm/align/WordListFilter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
/** | ||
* This software is released under the University of Illinois/Research and Academic Use License. See | ||
* the LICENSE file in the root folder for details. Copyright (c) 2016 | ||
* | ||
* Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign | ||
* http://cogcomp.cs.illinois.edu/ | ||
*/ | ||
package edu.illinois.cs.cogcomp.llm.align; | ||
|
||
import java.io.IOException; | ||
import java.util.ArrayList; | ||
import java.util.HashSet; | ||
import java.util.Set; | ||
|
||
import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager; | ||
import edu.illinois.cs.cogcomp.mrcs.align.ListFilter; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import edu.illinois.cs.cogcomp.core.io.LineIO; | ||
|
||
public class WordListFilter implements ListFilter<String> { | ||
public static final java.lang.String STOPWORD_FILE = "stopwordFile"; | ||
|
||
private static final String NAME = WordListFilter.class.getCanonicalName(); | ||
private Logger logger = LoggerFactory.getLogger(WordListFilter.class); | ||
private String m_stopwordFile; | ||
private Set<String> m_stopwords; | ||
|
||
public WordListFilter(ResourceManager rm_) throws IOException { | ||
m_stopwordFile = rm_.getString(STOPWORD_FILE); | ||
loadStopwords(); | ||
} | ||
|
||
/** | ||
* filter non-content words from an input array of string | ||
* | ||
* @param elements_ | ||
* a list of input words | ||
* @return an array of string containing only non-stopwords | ||
*/ | ||
|
||
@Override | ||
public String[] filter(String[] elements_) { | ||
String[] filteredElts = new String[elements_.length]; | ||
|
||
for (int i = 0; i < elements_.length; ++i) { | ||
String lcTok = elements_[i].toLowerCase(); | ||
|
||
if (!m_stopwords.contains(lcTok)) | ||
filteredElts[i] = elements_[i]; | ||
else | ||
filteredElts[i] = null; | ||
|
||
logger.debug( | ||
(null == filteredElts[i] ? "FILTERED" : "DID NOT FILTER") + " element '" + elements_[i] + "'."); | ||
} | ||
|
||
return filteredElts; | ||
} | ||
|
||
protected void loadStopwords() throws IOException { | ||
|
||
ArrayList<String> lines = LineIO.readFromClasspath(m_stopwordFile); | ||
m_stopwords = new HashSet<String>(); | ||
|
||
for (String line : lines) { | ||
m_stopwords.add(line.toLowerCase()); | ||
} | ||
|
||
} | ||
|
||
} |
Oops, something went wrong.