Merge pull request dweiss#1 from wayfair/master

Add/modify classes to adapt compound-splitter for use as a Lucene/Solr p...
DiceHoldingsInc · Aug 16, 2013 · ebaa47f · ebaa47f
2 parents 85db804 + 4ca91e4
commit ebaa47f
Show file tree

Hide file tree

Showing 6 changed files with 162 additions and 49 deletions.
diff --git a/src/eclipse/CompileCompoundDictionaries.launch b/src/eclipse/CompileCompoundDictionaries.launch
@@ -7,8 +7,10 @@
 <listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_TYPES">
 <listEntry value="1"/>
 </listAttribute>
+<stringAttribute key="org.eclipse.jdt.launching.CLASSPATH_PROVIDER" value="org.eclipse.m2e.launchconfig.classpathProvider"/>
 <stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="org.apache.lucene.analysis.de.compounds.CompileCompoundDictionaries"/>
 <stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="src/data/morphy.txt&#10;src/data/morphy-unknown.txt"/>
 <stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="compound-splitter"/>
+<stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/>
 <stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-ea"/>
 </launchConfiguration>
diff --git a/src/main/java/org/apache/lucene/analysis/de/compounds/CompileCompoundDictionaries.java b/src/main/java/org/apache/lucene/analysis/de/compounds/CompileCompoundDictionaries.java
@@ -16,27 +16,33 @@
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.FST.INPUT_TYPE;
 import org.apache.lucene.util.fst.NoOutputs;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 
 /**
  * Compile an FSA from an UTF-8 text file (must be properly sorted).
  */
 public class CompileCompoundDictionaries
 {
-    public static void main(String [] args) throws Exception
+  private static Logger log = LoggerFactory
+      .getLogger(CompileCompoundDictionaries.class);
+  private static String dataDir;
+
+  public static void setDataDir(String dir) {
+    dataDir = dir;
+  }
+
+  public static void compile(String[] vocabFiles)
+      throws Exception
     {
-        if (args.length < 1)
-        {
-            System.out.println("Args: input1.txt input2.txt ...");
-            System.exit(-1);
-        }
 
         final HashSet<BytesRef> words = new HashSet<BytesRef>();
-        for (int i = 0; i < args.length; i++)
+    for (int i = 0; i < vocabFiles.length; i++)
         {
             int count = 0;
             BufferedReader reader = new BufferedReader(new InputStreamReader(
-                new FileInputStream(args[i]), "UTF-8"));
+          new FileInputStream(vocabFiles[i]), "UTF-8"));
 
             Pattern pattern = Pattern.compile("\\s+");
             String line, last = null;
@@ -67,18 +73,20 @@ public static void main(String [] args) throws Exception
                 buffer.setLength(len);
                 buffer.reverse().append(GermanCompoundSplitter.RTL_SYMBOL);
                 words.add(new BytesRef(buffer));
-                if ((++count % 100000) == 0) System.err.println("Line: " + count);
+        if ((++count % 100000) == 0) log.info("Line: " + count);
             }
             reader.close();
 
-            System.out.println(String.format("%s, words: %d", args[i], count));
+      log.info("{}, words: {}", vocabFiles[i], count);
         }
 
         final BytesRef [] all = new BytesRef [words.size()];
         words.toArray(all);
 
         Arrays.sort(all, BytesRef.getUTF8SortedAsUnicodeComparator());
-        serialize("src/main/resources/words.fst", all);
+    serialize(
+dataDir + "words.fst",
+        all);
     }
 
     private static void serialize(String file, BytesRef [] all) throws IOException

diff --git a/src/main/java/org/apache/lucene/analysis/de/compounds/GermanCompoundSplitter.java b/src/main/java/org/apache/lucene/analysis/de/compounds/GermanCompoundSplitter.java
@@ -2,13 +2,15 @@
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.FileInputStream;
 import java.util.*;
 
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.fst.*;
 import org.apache.lucene.util.fst.FST.INPUT_TYPE;
-
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Simple greedy compound splitter for German. Objects of this class are <b>not thread
@@ -26,18 +28,20 @@ public class GermanCompoundSplitter
      * ?MenuId=WordFormation115012
      */
 
+    private static Logger log = LoggerFactory
+            .getLogger(GermanCompoundSplitter.class);
     /**
      * A static FSA with inflected and base surface forms from Morphy.
      * 
      * @see "http://www.wolfganglezius.de/doku.php?id=cl:surfaceForms"
      */
-    private final static FST<Object> surfaceForms;
+    private static FST<Object> surfaceForms;
 
     /**
      * A static FSA with glue glueMorphemes. This could be merged into a single FSA
      * together with {@link #surfaceForms}, but I leave it separate for now.
      */
-    private final static FST<Object> glueMorphemes;
+    private static FST<Object> glueMorphemes;
 
     /**
      * left-to-right word encoding symbol (FST).
@@ -49,21 +53,7 @@ public class GermanCompoundSplitter
      */
     static final char RTL_SYMBOL = '<';
 
-    /**
-     * Load and initialize static data structures.
-     */
-    static
-    {
-        try
-        {
-            surfaceForms = readMorphyFST();
-            glueMorphemes = createMorphemesFST();
-        }
-        catch (IOException e)
-        {
-            throw new RuntimeException("Failed to initialize static data structures.", e);
-        }
-    }
+
 
     /**
      * Category for a given chunk of a compound.
@@ -140,6 +130,25 @@ public static interface DecompositionListener
      */
     private final StringBuilder builder = new StringBuilder();
 
+
+    /**
+     * Load and initialize static data structures: reads the surface-form FST from {@code fstFile} and builds the
+     * glue-morpheme FST. An {@link IOException} is logged, not thrown; fields not yet assigned keep their old value.
+     */
+    public static void initFSTs(String fstFile)
+    {
+        try
+        {
+            surfaceForms = readMorphyFST(fstFile);
+            glueMorphemes = createMorphemesFST();
+        }
+        catch (IOException e)
+        {
+            log.error("Failed to initialize static data structures for German compound splitter from " + fstFile, e);
+        }
+    }
+
+
     /**
      * Splits the input sequence of characters into separate words if this sequence is
      * potentially a compound word.
@@ -195,7 +204,8 @@ public void decomposition(IntsRef utf32, ArrayDeque<Chunk> chunks)
         catch (IOException e)
         {
             // Shouldn't happen, but just in case.
-            throw new RuntimeException(e);
+            log.error(e.getMessage());
+            return null;
         }
     }
 
@@ -212,10 +222,12 @@ private void matchWord(IntsRef utf32, int offset) throws IOException
         {
             int chr = utf32.ints[i];
 
-            arc = surfaceForms.findTargetArc(chr, arc, arc);
+            arc = surfaceForms.findTargetArc(chr, arc, arc,
+                    surfaceForms.getBytesReader());
             if (arc == null) break;
 
-            if (surfaceForms.findTargetArc(RTL_SYMBOL, arc, scratch) != null)
+            if (surfaceForms.findTargetArc(RTL_SYMBOL, arc, scratch,
+                    surfaceForms.getBytesReader()) != null)
             {
                 Chunk ch = new Chunk(offset, i + 1, ChunkType.WORD);
                 wordsFromHere.add(ch);
@@ -257,7 +269,8 @@ private void matchGlueMorpheme(IntsRef utf32, final int offset) throws IOExcepti
         {
             int chr = utf32.ints[i];
 
-            arc = glueMorphemes.findTargetArc(chr, arc, arc);
+            arc = glueMorphemes.findTargetArc(chr, arc, arc,
+                    glueMorphemes.getBytesReader());
             if (arc == null) break;
 
             if (arc.isFinal())
@@ -296,29 +309,21 @@ private static IntsRef UTF16ToUTF32(CharSequence s, IntsRef scratchIntsRef)
     /**
      * Load surface forms FST.
      */
-    private static FST<Object> readMorphyFST()
+    private static FST<Object> readMorphyFST(String fstFile) throws IOException
     {
-        try
-        {
-            final InputStream is = 
-                GermanCompoundSplitter.class.getClassLoader().getResourceAsStream("words.fst");
-            final FST<Object> fst = new FST<Object>(new InputStreamDataInput(is),
+        final InputStream is = new FileInputStream(fstFile);
+        final FST<Object> fst = new FST<Object>(new InputStreamDataInput(is),
                 NoOutputs.getSingleton());
-            is.close();
-            return fst;
-        }
-        catch (IOException e)
-        {
-            throw new RuntimeException(e);
-        }
+        is.close();
+        return fst;
     }
 
     /**
      * Create glue morphemes FST.
      */
     private static FST<Object> createMorphemesFST() throws IOException
     {
-        String [] morphemes =
+        String[] morphemes =
         {
             "e", "es", "en", "er", "n", "ens", "ns", "s"
         };
@@ -334,10 +339,18 @@ private static FST<Object> createMorphemesFST() throws IOException
         final Builder<Object> builder = new Builder<Object>(INPUT_TYPE.BYTE4,
             NoOutputs.getSingleton());
         final Object nothing = NoOutputs.getSingleton().getNoOutput();
+
         for (String morpheme : morphemes)
         {
-            builder.add(morpheme, nothing);
+            int[] morphemeCodePoints = new int[morpheme.length()];
+            for (int i = 0; i < morpheme.length(); i++)
+            {
+                morphemeCodePoints[i] = morpheme.codePointAt(i);
+            }
+            builder.add(new IntsRef(morphemeCodePoints, 0, morpheme.length()),
+                    nothing);
         }
         return builder.finish();
     }
 }
+
diff --git a/src/main/java/org/apache/lucene/analysis/de/compounds/GermanCompoundSplitterTokenFilter.java b/src/main/java/org/apache/lucene/analysis/de/compounds/GermanCompoundSplitterTokenFilter.java
@@ -0,0 +1,79 @@
+package org.apache.lucene.analysis.de.compounds;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
+import org.apache.lucene.util.Version;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Token filter that runs each term through {@link GermanCompoundSplitter} and adds
+ * the parts of the first suggested split as sub-word tokens.
+ */
+public class GermanCompoundSplitterTokenFilter extends
+    CompoundWordTokenFilterBase {
+  private static final Logger log = LoggerFactory
+      .getLogger(GermanCompoundSplitterTokenFilter.class);
+
+  private final GermanCompoundSplitter splitter;
+
+  /**
+   * Creates the filter and (re)initializes the splitter's static FSTs.
+   *
+   * @param matchVersion Lucene version handed to the base class
+   * @param input the token stream to decompose
+   * @param fstFile path passed to {@link GermanCompoundSplitter#initFSTs(String)}
+   */
+  public GermanCompoundSplitterTokenFilter(Version matchVersion,
+      TokenStream input, String fstFile) {
+    super(matchVersion, input, null);
+    // NOTE(review): this reloads the static FSTs for every filter instance;
+    // consider loading them once.
+    GermanCompoundSplitter.initFSTs(fstFile);
+    this.splitter = new GermanCompoundSplitter();
+  }
+
+  /**
+   * Splits the current term and queues one {@code CompoundToken} per part of the
+   * first suggestion. If the splitter returns {@code null} the term itself is
+   * treated as the only suggestion. Any exception is logged and the term is left
+   * with whatever parts were queued so far.
+   */
+  public void decompose() {
+    final String compound = termAtt.toString();
+    try {
+      final CharSequence split = splitter.split(compound);
+      // e.g. supplywood -> supply.wood,sup.plywood
+      final String suggestions = (split != null) ? split.toString() : compound;
+      final String[] possibleSplits = suggestions.split(",");
+      if (possibleSplits.length == 0) {
+        return;
+      }
+      // Take the first suggestion.
+      final String[] words = possibleSplits[0].split("\\.");
+
+      // Search each part from the end of the previous one so that a part whose
+      // text also occurs earlier in the compound gets its own offset.
+      int searchFrom = 0;
+      for (String word : words) {
+        if (word.isEmpty()) {
+          continue;
+        }
+        int start = compound.indexOf(word, searchFrom);
+        if (start < 0) {
+          // Fall back to a search from the beginning.
+          start = compound.indexOf(word);
+        }
+        if (start < 0) {
+          log.warn("Part '" + word + "' not found in '" + compound + "', skipping");
+          continue;
+        }
+        tokens.add(new CompoundToken(start, word.length()));
+        searchFrom = start + word.length();
+      }
+    } catch (Exception e) {
+      // Keep the stack trace; e.getMessage() alone may even be null.
+      log.error("Failed to decompose '" + compound + "'", e);
+    }
+  }
+}
diff --git a/...ava/org/apache/lucene/analysis/de/compounds/GermanCompoundSplitterTokenFilterFactory.java b/...ava/org/apache/lucene/analysis/de/compounds/GermanCompoundSplitterTokenFilterFactory.java
@@ -0,0 +1,74 @@
+package org.apache.lucene.analysis.de.compounds;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Factory for {@link GermanCompoundSplitterTokenFilter}.
+ * <p>
+ * Arguments read in {@link #init(Map)}: {@code dataDir}, the directory that holds
+ * {@code words.fst} (and {@code morphy.txt} / {@code morphy-unknown.txt} when
+ * compiling), and {@code compileDict}, which when {@code true} rebuilds
+ * {@code words.fst} from those text files during initialization.
+ */
+public class GermanCompoundSplitterTokenFilterFactory extends
+    TokenFilterFactory {
+  private static final Logger log = LoggerFactory
+      .getLogger(GermanCompoundSplitterTokenFilterFactory.class);
+
+  private String dataDir;
+  private String fstFile;
+
+  /**
+   * Reads {@code dataDir} and {@code compileDict} and optionally compiles the
+   * dictionary. A failed compilation is logged, not rethrown.
+   *
+   * @throws IllegalArgumentException if {@code dataDir} is missing
+   */
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
+    this.dataDir = withTrailingSeparator(args.get("dataDir"));
+    this.fstFile = dataDir + "words.fst";
+    if (!Boolean.parseBoolean(args.get("compileDict"))) {
+      return;
+    }
+    String[] inputFiles = {
+        dataDir + "morphy.txt",
+        dataDir + "morphy-unknown.txt"};
+    try {
+      CompileCompoundDictionaries.setDataDir(dataDir);
+      CompileCompoundDictionaries.compile(inputFiles);
+    } catch (Exception e) {
+      // Pass the exception so the stack trace is logged, not just the message.
+      log.error("Failed to compile compound dictionaries in " + dataDir, e);
+    }
+  }
+
+  /** Creates a filter that loads its FST from {@code dataDir + "words.fst"}. */
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new GermanCompoundSplitterTokenFilter(luceneMatchVersion, input,
+        fstFile);
+  }
+
+  /**
+   * Returns {@code dir} ending in a path separator so file names can be appended
+   * directly; an empty string is returned unchanged.
+   */
+  private static String withTrailingSeparator(String dir) {
+    if (dir == null) {
+      throw new IllegalArgumentException("Missing required argument: dataDir");
+    }
+    if (dir.isEmpty() || dir.endsWith("/") || dir.endsWith(File.separator)) {
+      return dir;
+    }
+    return dir + File.separator;
+  }
+}
diff --git a/src/test/java/org/apache/lucene/analysis/de/compounds/GermanCompoundSplitterTest.java b/src/test/java/org/apache/lucene/analysis/de/compounds/GermanCompoundSplitterTest.java
@@ -6,7 +6,6 @@
 import java.util.ArrayList;
 import java.util.List;
 
-import org.apache.lucene.analysis.de.compounds.GermanCompoundSplitter;
 import org.testng.Assert;
 import org.testng.annotations.DataProvider;
 import org.testng.annotations.Test;