Skip to content

Commit

Permalink
Merge pull request dweiss#1 from wayfair/master
Browse files Browse the repository at this point in the history
Add/modify classes to adapt compound-splitter for use as a Lucene/Solr p...
  • Loading branch information
dweiss committed Aug 16, 2013
2 parents 85db804 + 4ca91e4 commit ebaa47f
Show file tree
Hide file tree
Showing 6 changed files with 162 additions and 49 deletions.
2 changes: 2 additions & 0 deletions src/eclipse/CompileCompoundDictionaries.launch
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@
<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_TYPES">
<listEntry value="1"/>
</listAttribute>
<stringAttribute key="org.eclipse.jdt.launching.CLASSPATH_PROVIDER" value="org.eclipse.m2e.launchconfig.classpathProvider"/>
<stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="org.apache.lucene.analysis.de.compounds.CompileCompoundDictionaries"/>
<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="src/data/morphy.txt&#10;src/data/morphy-unknown.txt"/>
<stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="compound-splitter"/>
<stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/>
<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-ea"/>
</launchConfiguration>
Original file line number Diff line number Diff line change
Expand Up @@ -16,27 +16,33 @@
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.INPUT_TYPE;
import org.apache.lucene.util.fst.NoOutputs;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
* Compile an FSA from a UTF-8 text file (must be properly sorted).
*/
public class CompileCompoundDictionaries
{
public static void main(String [] args) throws Exception
private static Logger log = LoggerFactory
.getLogger(CompileCompoundDictionaries.class);
private static String dataDir;

/**
 * Sets the location prefix used by {@code compile} when writing the compiled
 * dictionary.
 * <p>
 * The value is concatenated directly with the file name {@code "words.fst"}
 * (no path separator is inserted), so a directory path is expected to end
 * with a separator.
 *
 * @param dir prefix (typically a directory path with a trailing separator)
 *        for the generated {@code words.fst} file
 */
public static void setDataDir(String dir) {
dataDir = dir;
}

public static void compile(String[] vocabFiles)
throws Exception
{
if (args.length < 1)
{
System.out.println("Args: input1.txt input2.txt ...");
System.exit(-1);
}

final HashSet<BytesRef> words = new HashSet<BytesRef>();
for (int i = 0; i < args.length; i++)
for (int i = 0; i < vocabFiles.length; i++)
{
int count = 0;
BufferedReader reader = new BufferedReader(new InputStreamReader(
new FileInputStream(args[i]), "UTF-8"));
new FileInputStream(vocabFiles[i]), "UTF-8"));

Pattern pattern = Pattern.compile("\\s+");
String line, last = null;
Expand Down Expand Up @@ -67,18 +73,20 @@ public static void main(String [] args) throws Exception
buffer.setLength(len);
buffer.reverse().append(GermanCompoundSplitter.RTL_SYMBOL);
words.add(new BytesRef(buffer));
if ((++count % 100000) == 0) System.err.println("Line: " + count);
if ((++count % 100000) == 0) log.info("Line: " + count);
}
reader.close();

System.out.println(String.format("%s, words: %d", args[i], count));
log.info("{}, words: {}", vocabFiles[i], count);
}

final BytesRef [] all = new BytesRef [words.size()];
words.toArray(all);

Arrays.sort(all, BytesRef.getUTF8SortedAsUnicodeComparator());
serialize("src/main/resources/words.fst", all);
serialize(
dataDir + "words.fst",
all);
}

private static void serialize(String file, BytesRef [] all) throws IOException
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@

import java.io.IOException;
import java.io.InputStream;
import java.io.FileInputStream;
import java.util.*;

import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.*;
import org.apache.lucene.util.fst.FST.INPUT_TYPE;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Simple greedy compound splitter for German. Objects of this class are <b>not thread
Expand All @@ -26,18 +28,20 @@ public class GermanCompoundSplitter
* ?MenuId=WordFormation115012
*/

private static Logger log = LoggerFactory
.getLogger(GermanCompoundSplitter.class);
/**
* A static FSA with inflected and base surface forms from Morphy.
*
* @see "http://www.wolfganglezius.de/doku.php?id=cl:surfaceForms"
*/
private final static FST<Object> surfaceForms;
private static FST<Object> surfaceForms;

/**
* A static FSA with glue morphemes. This could be merged into a single FSA
* together with {@link #surfaceForms}, but I leave it separate for now.
*/
private final static FST<Object> glueMorphemes;
private static FST<Object> glueMorphemes;

/**
* left-to-right word encoding symbol (FST).
Expand All @@ -49,21 +53,7 @@ public class GermanCompoundSplitter
*/
static final char RTL_SYMBOL = '<';

/**
* Load and initialize static data structures.
*/
static
{
try
{
surfaceForms = readMorphyFST();
glueMorphemes = createMorphemesFST();
}
catch (IOException e)
{
throw new RuntimeException("Failed to initialize static data structures.", e);
}
}


/**
* Category for a given chunk of a compound.
Expand Down Expand Up @@ -140,6 +130,25 @@ public static interface DecompositionListener
*/
private final StringBuilder builder = new StringBuilder();


/**
 * Load and initialize the static data structures shared by all splitter
 * instances.
 * <p>
 * Both FSTs are built into locals first and only then published to the
 * static fields, so a failure never leaves one field refreshed while the
 * other is stale. On failure the error is logged together with the
 * originating exception and the static fields keep whatever value they had
 * before the call; no exception is propagated to the caller.
 *
 * @param fstFile path of the compiled surface forms FST to read
 */
public static void initFSTs(String fstFile)
{
    try
    {
        final FST<Object> loadedSurfaceForms = readMorphyFST(fstFile);
        final FST<Object> builtGlueMorphemes = createMorphemesFST();

        surfaceForms = loadedSurfaceForms;
        glueMorphemes = builtGlueMorphemes;
    }
    catch (IOException e)
    {
        // Pass the exception itself so the stack trace and cause are not lost.
        log.error("Failed to initialize static data structures for German compound splitter"
            + " (FST file: " + fstFile + ").", e);
    }
}


/**
* Splits the input sequence of characters into separate words if this sequence is
* potentially a compound word.
Expand Down Expand Up @@ -195,7 +204,8 @@ public void decomposition(IntsRef utf32, ArrayDeque<Chunk> chunks)
catch (IOException e)
{
// Shouldn't happen, but just in case.
throw new RuntimeException(e);
log.error(e.getMessage());
return null;
}
}

Expand All @@ -212,10 +222,12 @@ private void matchWord(IntsRef utf32, int offset) throws IOException
{
int chr = utf32.ints[i];

arc = surfaceForms.findTargetArc(chr, arc, arc);
arc = surfaceForms.findTargetArc(chr, arc, arc,
surfaceForms.getBytesReader());
if (arc == null) break;

if (surfaceForms.findTargetArc(RTL_SYMBOL, arc, scratch) != null)
if (surfaceForms.findTargetArc(RTL_SYMBOL, arc, scratch,
surfaceForms.getBytesReader()) != null)
{
Chunk ch = new Chunk(offset, i + 1, ChunkType.WORD);
wordsFromHere.add(ch);
Expand Down Expand Up @@ -257,7 +269,8 @@ private void matchGlueMorpheme(IntsRef utf32, final int offset) throws IOExcepti
{
int chr = utf32.ints[i];

arc = glueMorphemes.findTargetArc(chr, arc, arc);
arc = glueMorphemes.findTargetArc(chr, arc, arc,
glueMorphemes.getBytesReader());
if (arc == null) break;

if (arc.isFinal())
Expand Down Expand Up @@ -296,29 +309,21 @@ private static IntsRef UTF16ToUTF32(CharSequence s, IntsRef scratchIntsRef)
/**
* Load surface forms FST.
*/
private static FST<Object> readMorphyFST()
private static FST<Object> readMorphyFST(String fstFile) throws IOException
{
try
{
final InputStream is =
GermanCompoundSplitter.class.getClassLoader().getResourceAsStream("words.fst");
final FST<Object> fst = new FST<Object>(new InputStreamDataInput(is),
final InputStream is = new FileInputStream(fstFile);
final FST<Object> fst = new FST<Object>(new InputStreamDataInput(is),
NoOutputs.getSingleton());
is.close();
return fst;
}
catch (IOException e)
{
throw new RuntimeException(e);
}
is.close();
return fst;
}

/**
* Create glue morphemes FST.
*/
private static FST<Object> createMorphemesFST() throws IOException
{
String [] morphemes =
String[] morphemes =
{
"e", "es", "en", "er", "n", "ens", "ns", "s"
};
Expand All @@ -334,10 +339,18 @@ private static FST<Object> createMorphemesFST() throws IOException
final Builder<Object> builder = new Builder<Object>(INPUT_TYPE.BYTE4,
NoOutputs.getSingleton());
final Object nothing = NoOutputs.getSingleton().getNoOutput();

for (String morpheme : morphemes)
{
builder.add(morpheme, nothing);
int[] morphemeCodePoints = new int[morpheme.length()];
for (int i = 0; i < morpheme.length(); i++)
{
morphemeCodePoints[i] = morpheme.codePointAt(i);
}
builder.add(new IntsRef(morphemeCodePoints, 0, morpheme.length()),
nothing);
}
return builder.finish();
}
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package org.apache.lucene.analysis.de.compounds;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class GermanCompoundSplitterTokenFilter extends
CompoundWordTokenFilterBase {
private static Logger log = LoggerFactory
.getLogger(GermanCompoundSplitterTokenFilter.class);

private GermanCompoundSplitter splitter;

public GermanCompoundSplitterTokenFilter(Version matchVersion,
TokenStream input, String fstFile) {
super(matchVersion, input, null);
GermanCompoundSplitter.initFSTs(fstFile);
this.splitter = new GermanCompoundSplitter();
}

public void decompose() {
String splitWords = new String();
String incomingPossibleCompound = termAtt.toString();
try {
CharSequence sw = splitter.split(incomingPossibleCompound);
if (sw != null) {
splitWords = sw.toString(); // supplywood
// ->
// supply.wood,sup.plywood
} else {
splitWords = incomingPossibleCompound;
}
String[] possibleSplits = splitWords.split(",");
String[] words = possibleSplits[0].split("\\."); // Take the first
// suggestion
for (String word : words) {
int startInd = incomingPossibleCompound.indexOf(word);
int length = word.length();
tokens.add(new CompoundToken(startInd, length));
}
} catch (Exception e) {
log.error(e.getMessage());
}

}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package org.apache.lucene.analysis.de.compounds;

import java.io.IOException;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Factory for {@link GermanCompoundSplitterTokenFilter}.
 * <p>
 * Recognized arguments:
 * <ul>
 * <li>{@code dataDir} (required): directory holding {@code words.fst} and,
 * when compiling, {@code morphy.txt} and {@code morphy-unknown.txt};</li>
 * <li>{@code compileDict} (optional): when {@code "true"}, the dictionary is
 * compiled into {@code words.fst} during {@link #init(Map)}.</li>
 * </ul>
 */
public class GermanCompoundSplitterTokenFilterFactory extends
    TokenFilterFactory {
  private static final Logger log = LoggerFactory
      .getLogger(GermanCompoundSplitterTokenFilterFactory.class);

  private String dataDir;
  private String fstFile;

  /**
   * Reads the factory arguments and optionally compiles the dictionary.
   * A compilation failure is logged and does not abort initialization.
   *
   * @param args factory arguments
   * @throws IllegalArgumentException if {@code dataDir} is not supplied
   */
  @Override
  public void init(Map<String,String> args) {
    super.init(args);

    final String configuredDir = args.get("dataDir");
    if (configuredDir == null) {
      // Previously this silently produced the path "nullwords.fst".
      throw new IllegalArgumentException(
          "Missing required parameter 'dataDir' for "
              + GermanCompoundSplitterTokenFilterFactory.class.getSimpleName());
    }
    this.dataDir = withTrailingSeparator(configuredDir);
    this.fstFile = dataDir + "words.fst";

    if (Boolean.parseBoolean(args.get("compileDict"))) {
      final String[] inputFiles = {
          dataDir + "morphy.txt",
          dataDir + "morphy-unknown.txt"};
      try {
        CompileCompoundDictionaries.setDataDir(dataDir);
        CompileCompoundDictionaries.compile(inputFiles);
      } catch (Exception e) {
        // Keep the cause and stack trace, not just the message.
        log.error("Failed to compile compound dictionaries in '" + dataDir
            + "'.", e);
      }
    }
  }

  /**
   * Creates a compound splitting filter over {@code input} that reads its
   * FST from the configured data directory.
   */
  @Override
  public TokenStream create(TokenStream input) {
    return new GermanCompoundSplitterTokenFilter(luceneMatchVersion, input,
        fstFile);
  }

  /**
   * Returns {@code dir} with a trailing {@code '/'} appended unless it is
   * empty or already ends with a path separator. File names are appended to
   * the result by plain string concatenation.
   */
  private static String withTrailingSeparator(String dir) {
    if (dir.isEmpty() || dir.endsWith("/") || dir.endsWith("\\")) {
      return dir;
    }
    return dir + "/";
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.de.compounds.GermanCompoundSplitter;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
Expand Down

0 comments on commit ebaa47f

Please sign in to comment.