Merge pull request #563 from mayhewsw/master

Adding transliteration code as a sub project to cogcomp-nlp
CogComp · Oct 16, 2017 · a4458ec · a4458ec
2 parents f15e3c1 + edeec8c
commit a4458ec
Show file tree

Hide file tree

Showing 43 changed files with 5,960 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -7,3 +7,4 @@
 edison/wordnet/
 **/.project
 **/.classpath
+*~
diff --git a/README.md b/README.md
@@ -19,7 +19,8 @@ Each library contains detailed readme and instructions on how to use it. In addi
 | [curator](curator/README.md) | Supports use of [CogComp NLP Curator](http://cogcomp.cs.illinois.edu/page/software_view/Curator), a tool to run NLP applications as services. |
 | [edison](edison/README.md) | A library for feature extraction from `core-utilities` data structures.  | 
 | [lemmatizer](lemmatizer/README.md)  |  An application that uses [WordNet](https://wordnet.princeton.edu/) and simple rules to find the root forms of words in plain text. |
-| [tokenizer](tokenizer/README.md) | An application that identifies sentence and word boundaries in plain text. | 
+| [tokenizer](tokenizer/README.md) | An application that identifies sentence and word boundaries in plain text. |
+| [transliteration](transliteration/README.md) | An application that transliterates names between different scripts. | 
 | [pos](pos/README.md)  | An application that identifies the part of speech (e.g. verb + tense, noun + number) of each word in plain text.  |  
 | [ner](ner/README.md) | An application that identifies named entities in plain text according to two different sets of categories.  |
 | [md](md/README.md) | An application that identifies entity mentions in plain text.  |

diff --git a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/annotation/Annotator.java b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/annotation/Annotator.java
@@ -14,6 +14,7 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.IOException;
 import java.util.Properties;
 import java.util.Set;
 

diff --git a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/ViewNames.java b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/ViewNames.java
@@ -128,6 +128,7 @@ public class ViewNames {
     public static final String POST_ERE = "POST_ERE";
     public static final String EVENT_ERE = "EVENT_ERE";
 
+    public static final String TRANSLITERATION = "TRANSLITERATION";
 
     public static ViewTypes getViewType(String viewName) {
         switch (viewName) {

diff --git a/pipeline/pom.xml b/pipeline/pom.xml
@@ -136,6 +136,11 @@
             <artifactId>illinois-time</artifactId>
             <version>3.1.33</version>
         </dependency>
+        <dependency>
+            <groupId>edu.illinois.cs.cogcomp</groupId>
+            <artifactId>illinois-transliteration</artifactId>
+            <version>3.1.33</version>
+        </dependency>
 
     </dependencies>
 

diff --git a/pipeline/src/main/java/edu/illinois/cs/cogcomp/pipeline/common/PipelineConfigurator.java b/pipeline/src/main/java/edu/illinois/cs/cogcomp/pipeline/common/PipelineConfigurator.java
@@ -40,6 +40,7 @@ public class PipelineConfigurator extends AnnotatorServiceConfigurator {
     public static final Property USE_QUANTIFIER = new Property("useQuantifier", FALSE);
     public static final Property USE_VERB_SENSE = new Property("useVerbSense", FALSE);
     public static final Property USE_JSON = new Property("useJson", FALSE);
+    public static final Property USE_TRANSLITERATION = new Property("useTransliteration", FALSE);
     public static final Property USE_MENTION = new Property("useMention", FALSE);
     public static final Property USE_LAZY_INITIALIZATION = new Property(
             AnnotatorConfigurator.IS_LAZILY_INITIALIZED.key, TRUE);

diff --git a/pipeline/src/main/java/edu/illinois/cs/cogcomp/pipeline/main/PipelineFactory.java b/pipeline/src/main/java/edu/illinois/cs/cogcomp/pipeline/main/PipelineFactory.java
@@ -121,6 +121,10 @@ public static BasicAnnotatorService buildPipeline(Boolean disableCache, String..
                         nonDefaultValues.put(PipelineConfigurator.USE_VERB_SENSE.key,
                                 Configurator.TRUE);
                         break;
+                    case ViewNames.TRANSLITERATION:
+                        nonDefaultValues.put(PipelineConfigurator.USE_TRANSLITERATION.key,
+                                Configurator.TRUE);
+                        break;
                     case ViewNames.TIMEX3:
                         nonDefaultValues.put(PipelineConfigurator.USE_TIMEX3.key,
                                 Configurator.TRUE);
@@ -346,6 +350,11 @@ private static Map<String, Annotator> buildAnnotators(ResourceManager nonDefault
             viewGenerators.put(ViewNames.QUANTITIES, quantifierAnnotator);
         }
 
+        if (rm.getBoolean(PipelineConfigurator.USE_TRANSLITERATION)) {
+            TransliterationAnnotator transliterationAnnotator = new TransliterationAnnotator();
+            viewGenerators.put(ViewNames.TRANSLITERATION, transliterationAnnotator);
+        }
+
         if (rm.getBoolean(PipelineConfigurator.USE_SRL_PREP)) {
             PrepSRLAnnotator prepSRLAnnotator = new PrepSRLAnnotator();
             viewGenerators.put(ViewNames.SRL_PREP, prepSRLAnnotator);

diff --git a/pom.xml b/pom.xml
@@ -11,6 +11,7 @@
     <modules>
         <module>core-utilities</module>
         <module>tokenizer</module>
+        <module>transliteration</module>
         <module>lemmatizer</module>
         <module>edison</module>
         <module>curator</module>

diff --git a/transliteration/README.md b/transliteration/README.md
@@ -0,0 +1,60 @@
+# Transliteration
+
+This is a Java port of Jeff Pasternack's C# code from [Learning Better Transliterations](http://cogcomp.org/page/publication_view/205).
+
+See examples in [TestTransliteration](src/test/java/edu/illinois/cs/cogcomp/transliteration/TestTransliteration.java)
+or [Runner](src/main/java/edu/illinois/cs/cogcomp/transliteration/Runner.java).
+
+
+## Training data
+
+To train a model, you need pairs of names. A common source is Wikipedia interlanguage links. For example, 
+see [this data](http://www.clsp.jhu.edu/~anni/data/wikipedia_names) 
+from [Transliterating From All Languages](http://cis.upenn.edu/~ccb/publications/transliterating-from-all-languages.pdf)
+by Anne Irvine et al.
+
+The expected data format is a foreign name and its English counterpart, separated by a tab character:
+```text
+foreign<tab>english
+```
+
+That said, the [Utils class](src/main/java/edu/illinois/cs/cogcomp/utils/Utils.java) has readers for many 
+different datasets (including Anne Irvine's data).  
+
+## Training a model
+The standard class is the [SPModel](src/main/java/edu/illinois/cs/cogcomp/transliteration/SPModel.java). Use it 
+as follows:
+
+```java
+List<Example> training = Utils.readWikiData(trainfile);
+SPModel model = new SPModel(training);
+model.Train(10);
+model.WriteProbs(modelfile);
+
+```
+
+This will train a model, and write it to the path specified by `modelfile`.
+
+`SPModel` has another useful function called `Probability(source, target)`, which will return the transliteration probability
+of a given pair. 
+
+## Annotating
+A trained model can be used immediately after training, or you can initialize `SPModel` using a 
+previously trained and saved `modelfile`.
+
+```java
+SPModel model = new SPModel(modelfile);
+model.setMaxCandidates(10);
+TopList<Double,String> predictions = model.Generate(testexample);
+```  
+
+Because the maximum number of candidates is set to 10, `predictions` will contain at most 10 elements.
+They are sorted by score from highest to lowest, so the first element is the best candidate.
+
+## Interactive
+
+Once you have trained a model, it is often helpful to try interacting with it. Use [interactive.sh](scripts/interactive.sh)
+for this:
+```bash
+$ ./scripts/interactive.sh models/modelfile
+```
diff --git a/transliteration/config/project.properties b/transliteration/config/project.properties
@@ -0,0 +1,3 @@
+# Use ResourceManager to read these properties
+CuratorHost = trollope.cs.illinois.edu
+CuratorPort = 9010
diff --git a/transliteration/pom.xml b/transliteration/pom.xml
@@ -0,0 +1,134 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+    <parent>
+        <artifactId>illinois-cogcomp-nlp</artifactId>
+        <groupId>edu.illinois.cs.cogcomp</groupId>
+        <version>3.1.33</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <!-- groupId and version are not declared here, so they are inherited from the parent POM. -->
+    <artifactId>illinois-transliteration</artifactId>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
+    </properties>
+
+    <!-- Extra repository; presumably hosts the edu.illinois.cs.cogcomp artifacts used below (verify). -->
+    <repositories>
+        <repository>
+            <id>CogcompSoftware</id>
+            <name>CogcompSoftware</name>
+            <url>http://cogcomp.cs.illinois.edu/m2repo/</url>
+        </repository>
+    </repositories>
+
+    <dependencies>
+        <!-- JUnit is declared exactly once: Maven requires each groupId:artifactId to be unique within
+             <dependencies>. 4.12 is kept; it still ships the JUnit 3 junit.framework API. -->
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.12</version>
+            <scope>test</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>edu.illinois.cs.cogcomp</groupId>
+            <artifactId>illinois-core-utilities</artifactId>
+            <version>3.1.33</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-lang3</artifactId>
+            <version>3.4</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.belerweb</groupId>
+            <artifactId>pinyin4j</artifactId>
+            <version>2.5.0</version>
+        </dependency>
+
+        <!-- NOTE(review): a concrete SLF4J binding at compile scope is passed on to every consumer of this module. -->
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-log4j12</artifactId>
+            <version>1.7.13</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.ibm.icu</groupId>
+            <artifactId>icu4j</artifactId>
+            <version>56.1</version>
+        </dependency>
+
+        <dependency>
+            <groupId>edu.illinois.cs.cogcomp</groupId>
+            <artifactId>illinois-abstract-server</artifactId>
+            <version>0.1</version>
+        </dependency>
+        <dependency>
+            <groupId>edu.illinois.cs.cogcomp</groupId>
+            <artifactId>curator-interfaces</artifactId>
+            <version>0.7</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.thrift</groupId>
+            <artifactId>libthrift</artifactId>
+            <version>0.8.0</version>
+        </dependency>
+        <!-- NOTE(review): SNAPSHOT dependency in a versioned module; pin to a released version if one exists. -->
+        <dependency>
+            <groupId>edu.illinois.cs.cogcomp</groupId>
+            <artifactId>curator-utils</artifactId>
+            <version>0.0.4-SNAPSHOT</version>
+        </dependency>
+
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>2.0.2</version>
+                <configuration>
+                    <source>1.7</source>
+                    <target>1.7</target>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-source-plugin</artifactId>
+                <version>2.1.2</version>
+                <executions>
+                    <execution>
+                        <id>attach-sources</id>
+                        <goals>
+                            <goal>jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+        <resources>
+            <resource>
+                <directory>src/main/resources</directory>
+            </resource>
+        </resources>
+        <!-- wagon-ssh adds SSH-based transports (e.g. scp) for deployment. -->
+        <extensions>
+            <extension>
+                <groupId>org.apache.maven.wagon</groupId>
+                <artifactId>wagon-ssh</artifactId>
+                <version>2.4</version>
+            </extension>
+        </extensions>
+    </build>
+
+</project>
diff --git a/transliteration/scripts/interactive.sh b/transliteration/scripts/interactive.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+# Usage: interactive.sh <modelfile>. Classpath entries are relative, so run it from the module root.
+cpath="target/classes:target/dependency/*:config"
+MODEL=${1:?"usage: $0 <modelfile>"}
+# Hold the command in the positional parameters so a model path containing spaces stays one argument.
+set -- java -classpath "${cpath}" -Xmx8g edu.illinois.cs.cogcomp.transliteration.Interactive "$MODEL"
+echo "Running: $*"
+"$@"
diff --git a/transliteration/scripts/release.sh b/transliteration/scripts/release.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+if [ "$#" -ne 1 ]; then
+  echo "usage: $0 <package-name>"
+  exit
+fi
+
+# Get the current version
+VERSION=`mvn org.apache.maven.plugins:maven-help-plugin:2.1.1:evaluate -Dexpression=project.version | grep -v INFO`
+
+## DON'T FORGET TO CHANGE VERSION IF THIS IS A NEW RELEASE!!!
+PACKAGE_NAME=$1
+
+echo "The script should run the following commands for package: ${PACKAGE_NAME}-${VERSION}"
+
+## Deploy the Maven release
+echo "mvn javadoc:jar deploy"
+
+## Update the GitLab repository (also create a tag)
+echo "git tag v${VERSION} -m \"Releasing ${PACKAGE_NAME}-${VERSION}\""
+
+echo "git push --tags"
+
+
+## Generate the distribution package
+echo -n "Generating the distribution package ..."
+
+## Create a temporary directory
+TEMP_DIR="temp90614"
+PACKAGE_DIR="${TEMP_DIR}/${PACKAGE_NAME}-${VERSION}"
+
+mvn dependency:copy-dependencies
+
+mkdir -p ${PACKAGE_DIR}
+mkdir ${PACKAGE_DIR}/lib
+mkdir ${PACKAGE_DIR}/dist
+mkdir -p ${PACKAGE_DIR}/doc/javadoc
+mkdir ${PACKAGE_DIR}/src
+mkdir ${PACKAGE_DIR}/scripts
+
+mv target/${PACKAGE_NAME}-${VERSION}.jar ${PACKAGE_DIR}/dist/
+mv target/${PACKAGE_NAME}-${VERSION}-sources.jar ${PACKAGE_DIR}/src/
+unzip target/${PACKAGE_NAME}-${VERSION}-javadoc.jar -d ${PACKAGE_DIR}/doc/javadoc
+mv target/dependency/* ${PACKAGE_DIR}/lib/
+cp doc/* ${PACKAGE_DIR}/doc
+cp scripts/* ${PACKAGE_DIR}/scripts
+
+cd ${TEMP_DIR}
+zip -r ../${PACKAGE_NAME}.zip ${PACKAGE_NAME}-${VERSION}
+cd ..
+
+rm -rf ${TEMP_DIR}
+echo "Distribution package created: ${PACKAGE_NAME}.zip"
diff --git a/transliteration/scripts/runner.sh b/transliteration/scripts/runner.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+# Runs the Runner class on a train/test file pair. Optional first argument: directory holding train.data and test.data.
+cpath="target/classes:target/dependency/*:config"
+DIR="${1:-/path/to/transliteration/data}"
+TRAIN="$DIR/train.data"
+TEST="$DIR/test.data"
+# Hold the command in the positional parameters so data paths containing spaces stay single arguments.
+set -- java -classpath "${cpath}" -Xmx8g edu.illinois.cs.cogcomp.transliteration.Runner "$TRAIN" "$TEST"
+echo "Running: $*"
+"$@"