Skip to content

Commit

Permalink
Add API to give preanalysed sentences in an already parsed native typ…
Browse files Browse the repository at this point in the history
…e format

Thx to TeMPuS
  • Loading branch information
dlazesz committed Nov 5, 2018
1 parent d964cb8 commit 997e9e7
Show file tree
Hide file tree
Showing 7 changed files with 145 additions and 41 deletions.
7 changes: 7 additions & 0 deletions src/main/java/hu/ppke/itk/nlpg/purepos/ITagger.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,13 @@
package hu.ppke.itk.nlpg.purepos;

import hu.ppke.itk.nlpg.docmodel.ISentence;
import hu.ppke.itk.nlpg.purepos.common.TAnalysisItem;

import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import org.apache.commons.lang3.tuple.Pair;

/**
* Interface for a POS tagger implementation
Expand All @@ -46,6 +49,10 @@ public List<ISentence> tagSentence(List<String> sentence,

/**
 * Tags a sentence passed as a single string.
 * NOTE(review): how the string is tokenised and the exact role of
 * maxResultsNumber are not visible from this declaration; presumably it caps
 * the number of returned candidate taggings. Confirm against the implementation.
 */
public List<ISentence> tagSentence(String sentence, int maxResultsNumber);

/**
 * Tags a preanalysed sentence supplied in an already parsed form: each pair
 * holds a word form (left) and the analyses given for that word (right).
 * Returns a single tagged sentence.
 */
public ISentence tagSentenceEx(ArrayList<Pair<String, ArrayList<TAnalysisItem>>> sentence);

/**
 * Tags a preanalysed sentence supplied in an already parsed form: each pair
 * holds a word form (left) and the analyses given for that word (right).
 * Returns a list of tagged sentences.
 * NOTE(review): maxResultsNumber is presumably the upper bound on the number
 * of returned taggings; confirm against the decoder.
 */
public List<ISentence> tagSentenceEx(ArrayList<Pair<String, ArrayList<TAnalysisItem>>> sentence, int maxResultsNumber);

/**
 * Tagging entry point taking a Scanner, a PrintStream and two format names.
 * POSTagger implements this overload by delegating to the five-argument
 * overload with maxResultsNumber = 1.
 * NOTE(review): presumably input is read from scanner and results are written
 * to ps; the accepted format names are not visible here. Confirm.
 */
public void tag(Scanner scanner, String inputFormat, PrintStream ps, String outputFormat);

/**
 * Same entry point as the four-argument overload, with an explicit
 * maxResultsNumber.
 * NOTE(review): presumably input is read from scanner, results are written to
 * ps, and maxResultsNumber caps the taggings emitted per sentence. Confirm
 * against the implementation.
 */
public void tag(Scanner scanner, String inputFormat, PrintStream ps, String outputFormat, int maxResultsNumber);
Expand Down
48 changes: 36 additions & 12 deletions src/main/java/hu/ppke/itk/nlpg/purepos/POSTagger.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import hu.ppke.itk.nlpg.docmodel.internal.Sentence;
import hu.ppke.itk.nlpg.docmodel.internal.Token;
import hu.ppke.itk.nlpg.purepos.common.AnalysisQueue;
import hu.ppke.itk.nlpg.purepos.common.TAnalysisItem;
import hu.ppke.itk.nlpg.purepos.decoder.AbstractDecoder;
import hu.ppke.itk.nlpg.purepos.decoder.BeamSearch;
import hu.ppke.itk.nlpg.purepos.decoder.BeamedViterbi;
Expand Down Expand Up @@ -84,19 +85,24 @@ public POSTagger(final CompiledModel<String, Integer> model,

protected static List<String> preprocessSentence(List<String> sentence, AnalysisQueue analysisQueue) {
analysisQueue.init(sentence.size());
ArrayList<String> ret = new ArrayList<String>(sentence.size());
int i = 0;
for (String word : sentence) {
if (AnalysisQueue.isPreanalysed(word)) {
analysisQueue.addWord(word, i);
ret.add(AnalysisQueue.clean(word));
} else {
ret.add(word);
}
analysisQueue.addWord(word, i);
++i;
}
return analysisQueue.getAllWords();
}

return ret;
/**
 * Loads an already parsed sentence into the given analysis queue.
 *
 * The queue is re-initialised to the sentence length, then every
 * (word, analyses) pair is registered at its index in the sentence.
 *
 * @param sentence      pairs of word form (left) and its analyses (right)
 * @param analysisQueue queue that receives the words and analyses
 * @return the word list reported by the queue after all pairs were added
 */
protected static List<String> preprocessSentenceEx(ArrayList<Pair<String, ArrayList<TAnalysisItem>>> sentence,
        AnalysisQueue analysisQueue) {
    final int length = sentence.size();
    analysisQueue.init(length);

    for (int position = 0; position < length; ++position) {
        Pair<String, ArrayList<TAnalysisItem>> entry = sentence.get(position);
        analysisQueue.addWord(entry.getLeft(), entry.getRight(), position);
    }

    return analysisQueue.getAllWords();
}

@Override
Expand All @@ -106,12 +112,11 @@ public ISentence tagSentence(List<String> sentence) {

@Override
public List<ISentence> tagSentence(List<String> sentence, int maxRes) {
sentence = preprocessSentence(sentence, decoder.getUserAnals());
List<Pair<List<Integer>, Double>> tagList = decoder.decode(sentence,
maxRes);
List<String> plain_sent = preprocessSentence(sentence, decoder.getUserAnals());
List<Pair<List<Integer>, Double>> tagList = decoder.decode(plain_sent, maxRes);
List<ISentence> ret = new ArrayList<ISentence>();
for (Pair<List<Integer>, Double> tags : tagList) {
List<IToken> tokens = merge(sentence, tags.getKey());
List<IToken> tokens = merge(plain_sent, tags.getKey());
Sentence sent = new Sentence(tokens);
sent.setScore(tags.getValue());
ret.add(sent);
Expand Down Expand Up @@ -149,6 +154,25 @@ public List<ISentence> tagSentence(String sentence, int maxRes) {
return sents;
}

/**
 * Tags an already parsed sentence and returns one result: the first element
 * of the list produced by the two-argument overload called with maxRes = 1.
 */
@Override
public ISentence tagSentenceEx(ArrayList<Pair<String, ArrayList<TAnalysisItem>>> sentence) {
    final List<ISentence> candidates = tagSentenceEx(sentence, 1);
    return candidates.get(0);
}

/**
 * Tags an already parsed sentence.
 *
 * The (word, analyses) pairs are first loaded into the decoder's user
 * analysis queue, the resulting word list is decoded, and every decoded
 * tag sequence is merged with the words into a scored Sentence.
 *
 * @param sentence pairs of word form (left) and its analyses (right)
 * @param maxRes   value passed to the decoder as the result limit
 * @return one scored sentence per tag sequence returned by the decoder
 */
@Override
public List<ISentence> tagSentenceEx(ArrayList<Pair<String, ArrayList<TAnalysisItem>>> sentence, int maxRes) {
    final List<String> words = preprocessSentenceEx(sentence, decoder.getUserAnals());
    final List<Pair<List<Integer>, Double>> decoded = decoder.decode(words, maxRes);

    final List<ISentence> results = new ArrayList<ISentence>();
    for (Pair<List<Integer>, Double> candidate : decoded) {
        Sentence tagged = new Sentence(merge(words, candidate.getKey()));
        tagged.setScore(candidate.getValue());
        results.add(tagged);
    }
    return results;
}

@Override
public void tag(Scanner scanner, String inputFormat, PrintStream ps, String outputFormat) {
tag(scanner, inputFormat, ps, outputFormat, 1);
Expand Down
67 changes: 42 additions & 25 deletions src/main/java/hu/ppke/itk/nlpg/purepos/common/AnalysisQueue.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,12 @@
*/
public class AnalysisQueue {
// per position: the analyses (lemma, tag, prob) of the word, or null if it has none
protected ArrayList<Map<String, Double>> anals;
protected List<List<TAnalysisItem>> anals;
// (tag, (stem, prob))
// protected ArrayList<Map<String, Double>> stems;
// if the input contains probability information
protected ArrayList<Boolean> useProb;
protected ArrayList<String> words;
protected List<Boolean> useProb;
protected List<String> words, allWords;

// protected static String alnumPat = "\\p{L}\\p{N}";
// protected static String punctPat = "\\p{P}";
Expand All @@ -69,10 +69,11 @@ public class AnalysisQueue {
// + "\\{\\{(" + analPat + "(\\|\\|" + analPat + ")*" + ")\\}\\}");

public void init(int capacity) {
anals = new ArrayList<Map<String, Double>>(capacity);
anals = new ArrayList<List<TAnalysisItem>>(capacity);
// stems = new ArrayList<Map<String, Double>>(capacity);
useProb = new ArrayList<Boolean>(capacity);
words = new ArrayList<String>(capacity);
allWords = new ArrayList<String>(capacity);
for (int i = 0; i < capacity; ++i) {
anals.add(null);
// stems.add(null);
Expand All @@ -82,14 +83,19 @@ public void init(int capacity) {
}

public void addWord(String input, Integer position) {
if (!isPreanalysed(input)) {
allWords.add(input);
return;
}

Pair<String, List<String>> res = parse(input);
String word = res.getLeft();
List<String> analsList = res.getRight();
words.set(position, res.getLeft());
allWords.add(res.getLeft());

words.set(position, word);
anals.set(position, new HashMap<String, Double>());
List<TAnalysisItem> anals1 = new ArrayList<TAnalysisItem>();
anals.set(position, anals1);

for (String anal : analsList) {
for (String anal : res.getRight()) {
int indexOfValSep = anal.indexOf("$$");
String lemmaTag = anal;
double prob = 1;
Expand All @@ -98,18 +104,28 @@ public void addWord(String input, Integer position) {
prob = Double.parseDouble(anal.substring(indexOfValSep + 2));
lemmaTag = anal.substring(0, indexOfValSep);
}
anals.get(position).put(lemmaTag, prob);

anals1.add(new TAnalysisItem(anal2lemma(lemmaTag), anal2tag(lemmaTag), prob));
}

}

/**
 * Registers a word together with analyses that are already parsed.
 *
 * The word is always appended to the list of all words. Analyses are only
 * recorded when at least one is supplied; a null or empty list leaves the
 * slot at the given position untouched, so the word counts as not
 * preanalysed.
 *
 * @param word      surface form of the token
 * @param analsList analyses for the token; null or empty means none.
 *                  The list is stored as-is, not copied.
 * @param position  index of the token within the sentence
 */
public void addWord(String word, ArrayList<TAnalysisItem> analsList, Integer position) {
    allWords.add(word);
    // A null list used to throw a NullPointerException here, after the word
    // had already been appended; treat it the same as an empty list.
    if (analsList == null || analsList.isEmpty()) {
        return;
    }
    words.set(position, word);
    anals.set(position, analsList);
}

/**
 * Tells whether analyses are stored for the given position.
 *
 * @param position index of the token within the sentence
 * @return true when the position is below the size of the analysis list
 *         and its slot is not null
 */
public boolean hasAnal(Integer position) {
    if (anals.size() <= position) {
        return false;
    }
    return anals.get(position) != null;
}

public Map<String, Double> getAnals(Integer position) {
return anals.get(position);
Map<String, Double> ret = new HashMap<String, Double>();
for (TAnalysisItem entry : anals.get(position)) {
ret.put(entry.getLemma()+entry.getTag(), entry.getProb());
}
return ret;
}

public boolean useProbabilties(Integer position) {
Expand All @@ -128,19 +144,21 @@ public IProbabilityModel<Integer, String> getLexicalModelForWord(
protected Map<Integer, Double> transformTags(Integer position,
IVocabulary<String, Integer> tagVocabulary) {
Map<Integer, Double> retMap = new HashMap<Integer, Double>();
for (Map.Entry<String, Double> entry : this.anals.get(position)
.entrySet()) {
String tagStr = anal2tag(entry.getKey());
Integer tag = tagVocabulary.getIndex(tagStr);
for (TAnalysisItem entry : anals.get(position)) {
Integer tag = tagVocabulary.getIndex(entry.getTag());
if (tag == null) {
tag = tagVocabulary.addElement(tagStr);
tag = tagVocabulary.addElement(entry.getTag());
}
retMap.put(tag, entry.getValue());
retMap.put(tag, entry.getProb());

}
return retMap;
}

/**
 * Returns the list that collects every word passed to addWord since the
 * last init call. The internal list itself is returned, not a copy.
 */
public List<String> getAllWords() {
    return this.allWords;
}

public Set<Integer> getTags(Integer position,
IVocabulary<String, Integer> tagVocabulary) {
Map<Integer, Double> retMap = transformTags(position, tagVocabulary);
Expand All @@ -149,15 +167,14 @@ public Set<Integer> getTags(Integer position,
}

public Set<IToken> getAnalysises(Integer position) {
Set<String> fanals = anals.get(position).keySet();
Set<IToken> ret = new HashSet<IToken>();
for (String fa : fanals) {
ret.add(new Token(words.get(position), anal2lemma(fa), anal2tag(fa)));
for (TAnalysisItem entry : anals.get(position)) {
ret.add(new Token(words.get(position), entry.getLemma(), entry.getTag()));
}
return ret;
}

public static Pair<String, List<String>> parse(String token) {
protected static Pair<String, List<String>> parse(String token) {
int wordRB = token.indexOf("{{");
int analRB = token.indexOf("}}");
String word = token.substring(0, wordRB);
Expand All @@ -170,15 +187,15 @@ public static boolean isPreanalysed(String word) {
return word.indexOf("{{") > 0 && word.lastIndexOf("}}") > 0;
}

public static String clean(String word) {
protected static String clean(String word) {
return word.substring(0, word.indexOf("{{"));
}

public static String anal2tag(String anal) {
protected static String anal2tag(String anal) {
return anal.substring(anal.indexOf("["));
}

public static String anal2lemma(String anal) {
protected static String anal2lemma(String anal) {
return anal.substring(0, anal.indexOf("["));
}
}
29 changes: 29 additions & 0 deletions src/main/java/hu/ppke/itk/nlpg/purepos/common/TAnalysisItem.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package hu.ppke.itk.nlpg.purepos.common;

public class TAnalysisItem {
protected String lemma, tag;
protected Double prob;

public TAnalysisItem(String lemma, String ana, Double prob) {
this.lemma = lemma;
this.tag = ana;
this.prob = prob;
}

public static TAnalysisItem create(String lemma, String ana) {
Double prob = 1.0;
return new TAnalysisItem(lemma, ana, prob);
}

public String getLemma() {
return lemma;
}

public String getTag() {
return tag;
}

public Double getProb() {
return prob;
}
}
29 changes: 28 additions & 1 deletion src/test/java/hu/ppke/itk/nlpg/purepos/POSTaggerTest.java
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
package hu.ppke.itk.nlpg.purepos;

import hu.ppke.itk.nlpg.purepos.common.AnalysisQueue;
import hu.ppke.itk.nlpg.purepos.common.TAnalysisItem;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import junit.framework.Assert;

import org.apache.commons.lang3.tuple.Pair;
import org.junit.Test;

public class POSTaggerTest {
Expand All @@ -24,4 +26,29 @@ public void testPreprocessSentence() {
Assert.assertEquals(true, analysisQueue.hasAnal(2));
Assert.assertEquals(false, analysisQueue.hasAnal(1));
}

/**
 * Feeds a four-token, already parsed sentence ("A", "ház", "alma", ".")
 * through preprocessSentenceEx. Only "alma" carries analyses, so only
 * position 2 must report analyses afterwards, and the returned word list
 * must keep the original word forms in order.
 */
@Test
public void testPreprocessSentenceEx() {
    ArrayList<TAnalysisItem> almaAnalyses = new ArrayList<TAnalysisItem>();
    almaAnalyses.add(new TAnalysisItem("alma", "[FN][NOM]", 0.9));
    almaAnalyses.add(new TAnalysisItem("alom", "[FN][Pse3]", 0.1));

    ArrayList<Pair<String, ArrayList<TAnalysisItem>>> sentence =
            new ArrayList<Pair<String, ArrayList<TAnalysisItem>>>();
    sentence.add(Pair.of("A", new ArrayList<TAnalysisItem>()));
    sentence.add(Pair.of("ház", new ArrayList<TAnalysisItem>()));
    sentence.add(Pair.of("alma", almaAnalyses));
    sentence.add(Pair.of(".", new ArrayList<TAnalysisItem>()));

    AnalysisQueue queue = new AnalysisQueue();
    List<String> words = POSTagger.preprocessSentenceEx(sentence, queue);

    Assert.assertEquals("ház", words.get(1));
    Assert.assertEquals("alma", words.get(2));

    Assert.assertEquals(true, queue.hasAnal(2));
    Assert.assertEquals(false, queue.hasAnal(1));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ protected File createFile() throws IOException {
+ "<tag_mapping pattern=\"^(.*)(MN|FN)(\\|lat)(.*)$\" to=\"$1FN$4\" />"
+ "</config>";
File f = File.createTempFile("config_test_", ".xml");
PrintStream ps = new PrintStream(f);
PrintStream ps = new PrintStream(f, "UTF-8");
ps.print(out);
ps.close();
return f;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ public void unitTest() {
aq.init(5);

aq.addWord("alma{{alma[FN][NOM]$$0.9||alom[FN][Pse3]$$0.1}}", 1);
Assert.assertEquals(2, aq.anals.get(1).keySet().size());
Assert.assertEquals(2, aq.anals.get(1).size());
Map<String, Double> anals = aq.getAnals(1);
Assert.assertEquals(true, anals.containsKey("alma[FN][NOM]"));
Assert.assertEquals(true, anals.containsKey("alom[FN][Pse3]"));
Expand All @@ -112,7 +112,7 @@ public void unitTest() {
Assert.assertEquals(true, aq.useProbabilties(1));

aq.addWord("alma{{alma[FN][NOM]||alom[FN][Pse3]}}", 0);
Assert.assertEquals(2, aq.anals.get(0).keySet().size());
Assert.assertEquals(2, aq.anals.get(0).size());
anals = aq.getAnals(0);
Assert.assertEquals(true, anals.containsKey("alma[FN][NOM]"));
Assert.assertEquals(true, anals.containsKey("alom[FN][Pse3]"));
Expand Down

0 comments on commit 997e9e7

Please sign in to comment.