Skip to content

Commit

Permalink
Merge pull request #563 from mayhewsw/master
Browse files Browse the repository at this point in the history
Adding transliteration code as a sub project to cogcomp-nlp
  • Loading branch information
Daniel Khashabi authored Oct 16, 2017
2 parents f15e3c1 + edeec8c commit a4458ec
Show file tree
Hide file tree
Showing 43 changed files with 5,960 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@
edison/wordnet/
**/.project
**/.classpath
*~
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ Each library contains detailed readme and instructions on how to use it. In addi
| [curator](curator/README.md) | Supports use of [CogComp NLP Curator](http://cogcomp.cs.illinois.edu/page/software_view/Curator), a tool to run NLP applications as services. |
| [edison](edison/README.md) | A library for feature extraction from `core-utilities` data structures. |
| [lemmatizer](lemmatizer/README.md) | An application that uses [WordNet](https://wordnet.princeton.edu/) and simple rules to find the root forms of words in plain text. |
| [tokenizer](tokenizer/README.md) | An application that identifies sentence and word boundaries in plain text. |
| [tokenizer](tokenizer/README.md) | An application that identifies sentence and word boundaries in plain text. |
| [transliteration](transliteration/README.md) | An application that transliterates names between different scripts. |
| [pos](pos/README.md) | An application that identifies the part of speech (e.g. verb + tense, noun + number) of each word in plain text. |
| [ner](ner/README.md) | An application that identifies named entities in plain text according to two different sets of categories. |
| [md](md/README.md) | An application that identifies entity mentions in plain text. |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Properties;
import java.util.Set;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ public class ViewNames {
public static final String POST_ERE = "POST_ERE";
public static final String EVENT_ERE = "EVENT_ERE";

public static final String TRANSLITERATION = "TRANSLITERATION";

public static ViewTypes getViewType(String viewName) {
switch (viewName) {
Expand Down
5 changes: 5 additions & 0 deletions pipeline/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,11 @@
<artifactId>illinois-time</artifactId>
<version>3.1.33</version>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-transliteration</artifactId>
<version>3.1.33</version>
</dependency>

</dependencies>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ public class PipelineConfigurator extends AnnotatorServiceConfigurator {
public static final Property USE_QUANTIFIER = new Property("useQuantifier", FALSE);
public static final Property USE_VERB_SENSE = new Property("useVerbSense", FALSE);
public static final Property USE_JSON = new Property("useJson", FALSE);
public static final Property USE_TRANSLITERATION = new Property("useTransliteration", FALSE);
public static final Property USE_MENTION = new Property("useMention", FALSE);
public static final Property USE_LAZY_INITIALIZATION = new Property(
AnnotatorConfigurator.IS_LAZILY_INITIALIZED.key, TRUE);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,10 @@ public static BasicAnnotatorService buildPipeline(Boolean disableCache, String..
nonDefaultValues.put(PipelineConfigurator.USE_VERB_SENSE.key,
Configurator.TRUE);
break;
case ViewNames.TRANSLITERATION:
nonDefaultValues.put(PipelineConfigurator.USE_TRANSLITERATION.key,
Configurator.TRUE);
break;
case ViewNames.TIMEX3:
nonDefaultValues.put(PipelineConfigurator.USE_TIMEX3.key,
Configurator.TRUE);
Expand Down Expand Up @@ -346,6 +350,11 @@ private static Map<String, Annotator> buildAnnotators(ResourceManager nonDefault
viewGenerators.put(ViewNames.QUANTITIES, quantifierAnnotator);
}

if (rm.getBoolean(PipelineConfigurator.USE_TRANSLITERATION)) {
TransliterationAnnotator transliterationAnnotator = new TransliterationAnnotator();
viewGenerators.put(ViewNames.TRANSLITERATION, transliterationAnnotator);
}

if (rm.getBoolean(PipelineConfigurator.USE_SRL_PREP)) {
PrepSRLAnnotator prepSRLAnnotator = new PrepSRLAnnotator();
viewGenerators.put(ViewNames.SRL_PREP, prepSRLAnnotator);
Expand Down
1 change: 1 addition & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
<modules>
<module>core-utilities</module>
<module>tokenizer</module>
<module>transliteration</module>
<module>lemmatizer</module>
<module>edison</module>
<module>curator</module>
Expand Down
60 changes: 60 additions & 0 deletions transliteration/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Transliteration

This is a Java port of Jeff Pasternack's C# code for the paper [Learning Better Transliterations](http://cogcomp.org/page/publication_view/205).

See examples in [TestTransliteration](src/test/java/edu/illinois/cs/cogcomp/transliteration/TestTransliteration.java)
or [Runner](src/main/java/edu/illinois/cs/cogcomp/transliteration/Runner.java).


## Training data

To train a model, you need pairs of names. A common source is Wikipedia interlanguage links. For example,
see [this data](http://www.clsp.jhu.edu/~anni/data/wikipedia_names)
from [Transliterating From All Languages](http://cis.upenn.edu/~ccb/publications/transliterating-from-all-languages.pdf)
by Anne Irvine et al.

The standard data format expected is:
```bash
foreign<tab>english
```

That said, the [Utils class](src/main/java/edu/illinois/cs/cogcomp/utils/Utils.java) has readers for many
different datasets (including Anne Irvine's data).

## Training a model
The standard class is the [SPModel](src/main/java/edu/illinois/cs/cogcomp/transliteration/SPModel.java). Use it
as follows:

```java
List<Example> training = Utils.readWikiData(trainfile);
SPModel model = new SPModel(training);
model.Train(10);
model.WriteProbs(modelfile);

```

This will train a model, and write it to the path specified by `modelfile`.

`SPModel` has another useful function called `Probability(source, target)`, which will return the transliteration probability
of a given pair.

## Annotating
A trained model can be used immediately after training, or you can initialize `SPModel` using a
previously trained and saved `modelfile`.

```java
SPModel model = new SPModel(modelfile);
model.setMaxCandidates(10);
TopList<Double,String> predictions = model.Generate(testexample);
```

We limited the max number of candidates to 10, so `predictions` will have at most 10 elements. These
are sorted by score, highest to lowest, where the first element is the best.

## Interactive

Once you have trained a model, it is often helpful to try interacting with it. Use [interactive.sh](scripts/interactive.sh)
for this:
```bash
$ ./scripts/interactive.sh models/modelfile
```
3 changes: 3 additions & 0 deletions transliteration/config/project.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Use ResourceManager to read these properties
CuratorHost = trollope.cs.illinois.edu
CuratorPort = 9010
134 changes: 134 additions & 0 deletions transliteration/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build descriptor for the illinois-transliteration module.
  Inherits from the illinois-cogcomp-nlp parent POM (version 3.1.33).
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <parent>
        <artifactId>illinois-cogcomp-nlp</artifactId>
        <groupId>edu.illinois.cs.cogcomp</groupId>
        <version>3.1.33</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

    <artifactId>illinois-transliteration</artifactId>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
    </properties>

    <repositories>
        <repository>
            <id>CogcompSoftware</id>
            <name>CogcompSoftware</name>
            <url>http://cogcomp.cs.illinois.edu/m2repo/</url>
        </repository>
    </repositories>

    <dependencies>
        <dependency>
            <groupId>edu.illinois.cs.cogcomp</groupId>
            <artifactId>illinois-core-utilities</artifactId>
            <version>3.1.33</version>
        </dependency>

        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.4</version>
        </dependency>

        <!-- Single JUnit declaration: a second entry for the same groupId:artifactId
             (previously 3.8.1) made the dependency list non-unique. -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>

        <dependency>
            <groupId>com.belerweb</groupId>
            <artifactId>pinyin4j</artifactId>
            <version>2.5.0</version>
        </dependency>

        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.13</version>
        </dependency>

        <dependency>
            <groupId>com.ibm.icu</groupId>
            <artifactId>icu4j</artifactId>
            <version>56.1</version>
        </dependency>

        <dependency>
            <groupId>edu.illinois.cs.cogcomp</groupId>
            <artifactId>illinois-abstract-server</artifactId>
            <version>0.1</version>
        </dependency>
        <dependency>
            <groupId>edu.illinois.cs.cogcomp</groupId>
            <artifactId>curator-interfaces</artifactId>
            <version>0.7</version>
        </dependency>

        <dependency>
            <groupId>org.apache.thrift</groupId>
            <artifactId>libthrift</artifactId>
            <version>0.8.0</version>
        </dependency>
        <!-- NOTE(review): SNAPSHOT dependency; builds are not reproducible while this
             is unpinned. Confirm whether a released version exists. -->
        <dependency>
            <groupId>edu.illinois.cs.cogcomp</groupId>
            <artifactId>curator-utils</artifactId>
            <version>0.0.4-SNAPSHOT</version>
        </dependency>

    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.0.2</version>
                <configuration>
                    <source>1.7</source>
                    <target>1.7</target>
                </configuration>
            </plugin>
            <!-- Attaches a -sources jar to the build output. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-source-plugin</artifactId>
                <version>2.1.2</version>
                <executions>
                    <execution>
                        <id>attach-sources</id>
                        <goals>
                            <goal>jar</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
        <resources>
            <resource>
                <directory>src/main/resources</directory>
            </resource>
        </resources>
        <extensions>
            <extension>
                <groupId>org.apache.maven.wagon</groupId>
                <artifactId>wagon-ssh</artifactId>
                <version>2.4</version>
            </extension>
        </extensions>
    </build>

</project>
8 changes: 8 additions & 0 deletions transliteration/scripts/interactive.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/sh
#
# Runs edu.illinois.cs.cogcomp.transliteration.Interactive on a model file.
#
# Usage: interactive.sh <model-file>
#
# The classpath entries are relative (target/classes, target/dependency/*, config),
# so the script has to be started from the directory that contains them.

if [ "$#" -lt 1 ]; then
    echo "usage: $0 <model-file>" >&2
    exit 1
fi

cpath="target/classes:target/dependency/*:config"
MODEL=$1

echo "Running: java -classpath ${cpath} -Xmx8g edu.illinois.cs.cogcomp.transliteration.Interactive $MODEL"

# Invoke java directly with quoted arguments instead of expanding a command string:
# an unquoted expansion would word-split and glob-expand the model path, and the
# classpath wildcard must reach the JVM literally.
java -classpath "${cpath}" -Xmx8g edu.illinois.cs.cogcomp.transliteration.Interactive "$MODEL"
53 changes: 53 additions & 0 deletions transliteration/scripts/release.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/bin/bash
#
# Builds a distribution zip (<package-name>.zip) for the current Maven project.
#
# Usage: release.sh <package-name>
#
# The deploy/tag/push commands are only printed, not executed; the packaging
# steps (copying dependencies, assembling the directory tree, zipping) are run.

if [ "$#" -ne 1 ]; then
    echo "usage: $0 <package-name>" >&2
    # A usage error must not look like success to callers.
    exit 1
fi

# Get the current version. Besides [INFO] lines, drop warnings and download
# progress lines so that only the version string is left.
VERSION=$(mvn org.apache.maven.plugins:maven-help-plugin:2.1.1:evaluate -Dexpression=project.version | grep -Ev 'INFO|WARNING|^Download')

if [ -z "${VERSION}" ]; then
    echo "error: could not determine the project version from Maven" >&2
    exit 1
fi

## DON'T FORGET TO CHANGE VERSION IF THIS IS A NEW RELEASE!!!
PACKAGE_NAME=$1

echo "The script should run the following commands for package: ${PACKAGE_NAME}-${VERSION}"

## Deploy the Maven release
echo "mvn javadoc:jar deploy"

## Update the remote Git repository (also create a tag)
echo "git tag v${VERSION} -m \"Releasing ${PACKAGE_NAME}-${VERSION}\""

echo "git push --tags"


## Generate the distribution package
echo -n "Generating the distribution package ..."

## Create a temporary directory
TEMP_DIR="temp90614"
PACKAGE_DIR="${TEMP_DIR}/${PACKAGE_NAME}-${VERSION}"

mvn dependency:copy-dependencies

# -p everywhere so a leftover tree from an interrupted run does not cause errors.
mkdir -p "${PACKAGE_DIR}"
mkdir -p "${PACKAGE_DIR}/lib"
mkdir -p "${PACKAGE_DIR}/dist"
mkdir -p "${PACKAGE_DIR}/doc/javadoc"
mkdir -p "${PACKAGE_DIR}/src"
mkdir -p "${PACKAGE_DIR}/scripts"

mv "target/${PACKAGE_NAME}-${VERSION}.jar" "${PACKAGE_DIR}/dist/"
mv "target/${PACKAGE_NAME}-${VERSION}-sources.jar" "${PACKAGE_DIR}/src/"
unzip "target/${PACKAGE_NAME}-${VERSION}-javadoc.jar" -d "${PACKAGE_DIR}/doc/javadoc"
# The globs below are intentionally unquoted so they expand to the directory contents.
mv target/dependency/* "${PACKAGE_DIR}/lib/"
cp doc/* "${PACKAGE_DIR}/doc"
cp scripts/* "${PACKAGE_DIR}/scripts"

# Zip inside a subshell: the working directory of the script never changes, so a
# failed cd cannot make the cleanup below run somewhere else.
(cd "${TEMP_DIR}" && zip -r "../${PACKAGE_NAME}.zip" "${PACKAGE_NAME}-${VERSION}")

rm -rf "${TEMP_DIR}"
echo "Distribution package created: ${PACKAGE_NAME}.zip"
10 changes: 10 additions & 0 deletions transliteration/scripts/runner.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/sh
#
# Runs edu.illinois.cs.cogcomp.transliteration.Runner on a train/test pair.
#
# Usage: runner.sh [data-dir]
#
# data-dir must contain train.data and test.data. When it is omitted, the
# placeholder path below is used, so edit it or pass the directory explicitly.
# The classpath entries are relative, so start the script from the directory
# that contains target/ and config/.

cpath="target/classes:target/dependency/*:config"
DIR="${1:-/path/to/transliteration/data}"
TRAIN="$DIR/train.data"
TEST="$DIR/test.data"

echo "Running: java -classpath ${cpath} -Xmx8g edu.illinois.cs.cogcomp.transliteration.Runner $TRAIN $TEST"

# Quoted arguments keep paths with spaces intact and pass the classpath
# wildcard to the JVM without shell expansion.
java -classpath "${cpath}" -Xmx8g edu.illinois.cs.cogcomp.transliteration.Runner "$TRAIN" "$TEST"
Loading

0 comments on commit a4458ec

Please sign in to comment.