Skip to content

Commit

Permalink
Initial.
Browse files Browse the repository at this point in the history
  • Loading branch information
dweiss committed May 29, 2011
0 parents commit ddce802
Show file tree
Hide file tree
Showing 8 changed files with 158,777 additions and 0 deletions.
8 changes: 8 additions & 0 deletions .classpath
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="src" path="src"/>
<classpathentry kind="lib" path="lib/jWordSplitter.jar"/>
<classpathentry kind="lib" path="lib/morfologik-stemming-nodict-1.4.0.jar"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry kind="output" path="bin"/>
</classpath>
17 changes: 17 additions & 0 deletions .project
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>compound-splitter</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.jdt.core.javanature</nature>
</natures>
</projectDescription>
Binary file added lib/jWordSplitter.jar
Binary file not shown.
Binary file added lib/morfologik-stemming-nodict-1.4.0.jar
Binary file not shown.
90 changes: 90 additions & 0 deletions src/SplitterTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.regex.Pattern;

import de.abelssoft.wordtools.jWordSplitter.impl.GermanWordSplitter;

/**
 * Command-line evaluation harness for compound splitters.
 *
 * <p>Reads the classpath resource {@code ccorpus.txt} line by line. Each line is
 * split on whitespace: the first field is the compound to split and the second
 * field is the annotated reference decomposition. The reference is normalized
 * (see {@link #normalizeReference(String)}) and compared for exact equality with
 * the output of a {@link Decompounder}. Mismatches are printed to standard output,
 * followed by the total number of instances and the number (and percentage) of
 * exact matches.
 */
public class SplitterTest
{
    /** Name of the corpus resource looked up through the context class loader. */
    private static final String CORPUS_RESOURCE = "ccorpus.txt";

    /** Field separator of a corpus line (a single whitespace character). */
    private static final Pattern FIELD_SEPARATOR = Pattern.compile("\\s");

    /** Matches a non-empty <code>{...}</code> group in the reference decomposition. */
    private static final Pattern BRACED_GROUP = Pattern.compile("\\{[^\\}]+\\}");

    /** Matches a single opening or closing parenthesis. */
    private static final Pattern PARENTHESIS = Pattern.compile("[\\(\\)]");

    /** Matches a comma followed by one or more lowercase ASCII letters. */
    private static final Pattern COMMA_SUFFIX = Pattern.compile("\\,[a-z]+");

    /**
     * A strategy that splits a compound word into its parts.
     */
    public static interface Decompounder
    {
        /**
         * Splits the given word.
         *
         * @param in the word to split
         * @return the textual form of the split; the implementation in this file
         *         joins the parts with {@code "+"}
         */
        public String split(String in);
    }

    /**
     * {@link Decompounder} backed by jWordSplitter's {@link GermanWordSplitter}.
     */
    public static class JWordSplitterDecompounder implements Decompounder
    {
        /** The underlying splitter, configured once in the constructor. */
        private final GermanWordSplitter wordSplitter;

        /**
         * Creates the underlying splitter with strict mode enabled and a minimum
         * word length of 3.
         *
         * @throws RuntimeException wrapping the {@link IOException} if the
         *         splitter cannot be initialized
         */
        JWordSplitterDecompounder()
        {
            try
            {
                // NOTE(review): the meaning of the boolean constructor argument is
                // defined by jWordSplitter and is not visible here -- confirm.
                wordSplitter = new GermanWordSplitter(false);
                wordSplitter.setStrictMode(true);
                wordSplitter.setMinimumWordLength(3);
            }
            catch (IOException e)
            {
                throw new RuntimeException(e);
            }
        }

        /**
         * Splits {@code in} with the underlying splitter and joins the resulting
         * parts with {@code "+"}.
         *
         * @param in the word to split
         * @return the parts joined by {@code "+"}; empty if the splitter returns
         *         no parts
         */
        @Override
        public String split(String in)
        {
            StringBuilder b = new StringBuilder();
            for (String s : wordSplitter.splitWord(in))
            {
                if (b.length() > 0)
                {
                    b.append("+");
                }
                b.append(s);
            }

            return b.toString();
        }
    }

    /**
     * Strips annotations from a reference decomposition so that it can be
     * compared with a {@link Decompounder}'s output.
     *
     * <p>Applied in order: remove <code>{...}</code> groups, remove parentheses,
     * remove {@code ,xyz} suffixes (comma plus lowercase ASCII letters), remove
     * {@code |} characters, then replace {@code U} with {@code ü} and {@code A}
     * with {@code ä}.
     *
     * @param annotated the raw second field of a corpus line
     * @return the normalized decomposition
     */
    private static String normalizeReference(String annotated)
    {
        String decomposed = annotated;
        decomposed = BRACED_GROUP.matcher(decomposed).replaceAll("");
        decomposed = PARENTHESIS.matcher(decomposed).replaceAll("");
        decomposed = COMMA_SUFFIX.matcher(decomposed).replaceAll("");
        decomposed = decomposed.replace("|", "");
        decomposed = decomposed.replace("U", "ü");
        decomposed = decomposed.replace("A", "ä");
        return decomposed;
    }

    /**
     * Runs the evaluation over {@code ccorpus.txt} and prints the results.
     *
     * @param args ignored
     * @throws IOException if the corpus resource cannot be found or read
     * @throws Exception any other failure raised while evaluating
     */
    public static void main(String [] args) throws Exception
    {
        Decompounder decompounder = new JWordSplitterDecompounder();

        ClassLoader cl = Thread.currentThread().getContextClassLoader();
        InputStream in = cl.getResourceAsStream(CORPUS_RESOURCE);
        if (in == null)
        {
            // getResourceAsStream signals a missing resource with null; fail with
            // a clear message instead of an NPE inside InputStreamReader.
            throw new IOException("Corpus resource not found on classpath: "
                + CORPUS_RESOURCE);
        }

        int instances = 0;
        int correct = 0;

        BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
        try
        {
            String line;
            while ((line = reader.readLine()) != null)
            {
                if (line.trim().length() == 0)
                {
                    // Tolerate blank lines (e.g. a trailing newline at end of file).
                    continue;
                }

                String [] parts = FIELD_SEPARATOR.split(line);
                if (parts.length < 2)
                {
                    // Without a reference decomposition there is nothing to compare.
                    System.err.println("Skipping malformed corpus line: " + line);
                    continue;
                }

                String compound = parts[0];
                String decomposed = normalizeReference(parts[1]);

                // Examples noted by the original author (presumably known problem
                // cases):
                // mediengestalter
                // minimal+ausführung minima+laus+führung
                String result = decompounder.split(compound);

                instances++;
                if (result.equals(decomposed))
                {
                    correct++;
                }
                else
                {
                    System.out.println(decomposed + " " + result);
                }
            }
        }
        finally
        {
            // Closing the reader also closes the underlying resource stream.
            reader.close();
        }

        // Avoid printing "NaN%" when the corpus contained no usable lines.
        double percent = instances == 0 ? 0.0 : correct * 100.0 / instances;

        System.out.println("Instances: " + instances);
        System.out.println("Correct: " + correct + " (" + percent + "%)");
    }
}
Loading

0 comments on commit ddce802

Please sign in to comment.