-
Notifications
You must be signed in to change notification settings - Fork 29
Candidate Term Extraction
JATE2 supports three types of term candidate extraction: Part-of-Speech (PoS) pattern based, Noun Phrase (NP) chunking based, and N-gram based. The core idea of JATE2 is to integrate candidate extraction with Solr at index time. To support querying the frequency information of all terminology units (particularly Multi-Word Terms) required by the various statistical ranking algorithms, two fields are needed. The first field, `jate_ngraminfo`, indexes all n-grams from the text corpus, together with their offsets, positions, and other metadata required by the various ATE methods. The second field, `jate_cterms`, indexes term candidates, whose definition is always application dependent (e.g., noun phrases, PoS sequence patterns). This field stores only the terminology lexical units; their statistical and metadata information must be looked up from the `jate_ngraminfo` field. Therefore, `jate_ngraminfo` is configured so that the range of n-grams it extracts covers all the candidate terms from `jate_cterms`.
Any ATE setting therefore needs these two fields. The following examples show the default/demo settings of the `jate_ngraminfo` and `jate_cterms` fields that support the three types of candidate term extraction.
<!-- N-gram lookup field. Text is indexed here as n-gram tokens so that information such as
     frequency and offsets can be looked up for the candidate terms coming from the
     candidate term field (default=jate_cterms).
     It must be indexed, with termVectors and termOffsets set to true. -->
<field name="jate_ngraminfo"
       type="jate_text_2_ngrams"
       indexed="true"
       stored="false"
       multiValued="false"
       termVectors="true"
       termPositions="true"
       termOffsets="true"
       termPayloads="true"/>
<!-- Index-time analysis for the jate_ngraminfo field. Stages run in the order listed:
     char filters, OpenNLP tokenizer, ASCII folding, OpenNLP PoS tagger, ComplexShingleFilter
     (minTokens=2, maxTokens=6, unigrams also output), lower-casing, PunctuationRemover,
     EnglishLemmatisationFilter and StopFilter. -->
<fieldType name="jate_text_2_ngrams" class="solr.TextField" positionIncrementGap="100">
  <analyzer type="index">
    <!-- html unicode character filter for irregular text.
         The double quote, less-than, greater-than and ampersand replacements are written as
         entity references: a raw ", < or & inside a double-quoted attribute value makes the
         file not well-formed. After parsing, each replacement is still the plain character
         surrounded by spaces. -->
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u0027" replacement=" ' "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u000a" replacement=" \\n "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u0009" replacement=" \\t "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u0008" replacement=" \\b "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u000d" replacement=" \\r "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u000c" replacement=" \\f "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u0022" replacement=" &quot; "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u005c" replacement=" \\ "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u003c" replacement=" &lt; "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u003e" replacement=" &gt; "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u003d" replacement=" = "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u0026" replacement=" &amp; "/>
    <charFilter class="solr.HTMLStripCharFilterFactory"/>

    <tokenizer class="org.apache.lucene.analysis.jate.OpenNLPTokenizerFactory"
               sentenceModel="en-sent.bin"
               tokenizerModel="en-token.bin"/>

    <filter class="solr.ASCIIFoldingFilterFactory"/>
    <filter class="org.apache.lucene.analysis.jate.OpenNLPPOSTaggerFactory"
            posTaggerClass="uk.ac.shef.dcs.jate.nlp.opennlp.POSTaggerOpenNLP"
            posTaggerModel="en-pos-maxent.bin"/>
    <!-- Builds the n-grams; outputUnigrams="true" keeps single tokens alongside the shingles. -->
    <filter class="org.apache.lucene.analysis.jate.ComplexShingleFilterFactory"
            minTokens="2" maxTokens="6"
            maxCharLength="40" minCharLength="2"
            removeLeadingStopWords="true" removeTrailingStopWords="true"
            removeLeadingSymbolicTokens="true" removeTrailingSymbolicTokens="true"
            stripAnySymbolChars="false"
            stripLeadingSymbolChars="true" stripTrailingSymbolChars="true"
            stopWords="stopwords.txt" stopWordsIgnoreCase="true"
            outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" "/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="org.apache.lucene.analysis.jate.PunctuationRemoverFactory"
            stripAnySymbols="false"
            stripLeadingSymbols="true" stripTrailingSymbols="true"/>
    <filter class="org.apache.lucene.analysis.jate.EnglishLemmatisationFilterFactory"
            lemmaResourceDir="lemmatiser"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"
            enablePositionIncrements="true"/>
  </analyzer>
</fieldType>
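Three types of candidate term extraction are supported. Each configuration below defines the same `jate_cterms` field and `jate_text_2_terms` field type, so they are alternatives: use only one of them in a given schema.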
<!-- Candidate term field: the text indexed here is analysed into candidate terms.
     It must be indexed, with termVectors set to true. -->
<field name="jate_cterms"
       type="jate_text_2_terms"
       indexed="true"
       stored="false"
       multiValued="false"
       termVectors="true"/>
<!-- A configuration for PoS based candidate extraction.
     Stages run in the order listed: char filters, OpenNLP tokenizer, ASCII folding,
     OpenNLP PoS tagger, OpenNLPRegexChunker (patterns read from aclrdtec.patterns,
     minTokens=1, maxTokens=5), lower-casing, EnglishLemmatisationFilter and StopFilter. -->
<fieldType name="jate_text_2_terms" class="solr.TextField" positionIncrementGap="100">
  <analyzer type="index">
    <!-- html unicode character filter for irregular text.
         The double quote, less-than, greater-than and ampersand replacements are written as
         entity references: a raw ", < or & inside a double-quoted attribute value makes the
         file not well-formed. After parsing, each replacement is still the plain character
         surrounded by spaces. -->
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u0027" replacement=" ' "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u000a" replacement=" \\n "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u0009" replacement=" \\t "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u0008" replacement=" \\b "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u000d" replacement=" \\r "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u000c" replacement=" \\f "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u0022" replacement=" &quot; "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u005c" replacement=" \\ "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u003c" replacement=" &lt; "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u003e" replacement=" &gt; "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u003d" replacement=" = "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u0026" replacement=" &amp; "/>
    <charFilter class="solr.HTMLStripCharFilterFactory"/>

    <tokenizer class="org.apache.lucene.analysis.jate.OpenNLPTokenizerFactory"
               sentenceModel="en-sent.bin"
               tokenizerModel="en-token.bin"/>

    <filter class="solr.ASCIIFoldingFilterFactory"/>
    <filter class="org.apache.lucene.analysis.jate.OpenNLPPOSTaggerFactory"
            posTaggerClass="uk.ac.shef.dcs.jate.nlp.opennlp.POSTaggerOpenNLP"
            posTaggerModel="en-pos-maxent.bin"/>
    <!-- Candidate selection step of this variant: the chunker is given the PoS pattern file. -->
    <filter class="org.apache.lucene.analysis.jate.OpenNLPRegexChunkerFactory"
            posTaggerClass="uk.ac.shef.dcs.jate.nlp.opennlp.POSTaggerOpenNLP"
            posTaggerModel="en-pos-maxent.bin"
            patterns="aclrdtec.patterns"
            minTokens="1" maxTokens="5"
            maxCharLength="40" minCharLength="1"
            removeLeadingStopWords="true" removeTrailingStopWords="true"
            removeLeadingSymbolicTokens="true" removeTrailingSymbolicTokens="true"
            stripAnySymbolChars="false"
            stripLeadingSymbolChars="true" stripTrailingSymbolChars="true"
            stopWords="stopwords.txt" stopWordsIgnoreCase="true"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="org.apache.lucene.analysis.jate.EnglishLemmatisationFilterFactory"
            lemmaResourceDir="lemmatiser"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"
            enablePositionIncrements="true"/>
  </analyzer>
</fieldType>
<!-- Candidate term field: the text indexed here is analysed into candidate terms.
     Required settings: indexed="true" and termVectors="true". -->
<field name="jate_cterms"
       type="jate_text_2_terms"
       indexed="true"
       stored="false"
       multiValued="false"
       termVectors="true"/>
<!-- A configuration for NP Chunking based candidate extraction.
     Stages run in the order listed: char filters, OpenNLP tokenizer, ASCII folding,
     OpenNLP PoS tagger, OpenNLPNounPhraseFilter (chunkerModel en-chunker.bin, minTokens=1,
     maxTokens=5), lower-casing, PunctuationRemover, EnglishLemmatisationFilter and StopFilter. -->
<fieldType name="jate_text_2_terms" class="solr.TextField" positionIncrementGap="100">
  <analyzer type="index">
    <!-- Character filters for irregular text.
         The double quote, less-than, greater-than and ampersand replacements are written as
         entity references: a raw ", < or & inside a double-quoted attribute value makes the
         file not well-formed. After parsing, each replacement is still the plain character
         surrounded by spaces. -->
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u0027" replacement=" ' "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u000a" replacement=" \\n "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u0009" replacement=" \\t "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u0008" replacement=" \\b "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u000d" replacement=" \\r "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u000c" replacement=" \\f "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u0022" replacement=" &quot; "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u005c" replacement=" \\ "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u003c" replacement=" &lt; "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u003e" replacement=" &gt; "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u003d" replacement=" = "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u0026" replacement=" &amp; "/>
    <charFilter class="solr.HTMLStripCharFilterFactory"/>

    <tokenizer class="org.apache.lucene.analysis.jate.OpenNLPTokenizerFactory"
               sentenceModel="en-sent.bin"
               tokenizerModel="en-token.bin"/>

    <filter class="solr.ASCIIFoldingFilterFactory"/>
    <filter class="org.apache.lucene.analysis.jate.OpenNLPPOSTaggerFactory"
            posTaggerClass="uk.ac.shef.dcs.jate.nlp.opennlp.POSTaggerOpenNLP"
            posTaggerModel="en-pos-maxent.bin"/>
    <!-- Candidate selection step of this variant: the filter is given the chunker model. -->
    <filter class="org.apache.lucene.analysis.jate.OpenNLPNounPhraseFilterFactory"
            chunkerModel="en-chunker.bin"
            minTokens="1" maxTokens="5"
            maxCharLength="40" minCharLength="1"
            removeLeadingStopWords="true" removeTrailingStopWords="true"
            removeLeadingSymbolicTokens="true" removeTrailingSymbolicTokens="true"
            stopWords="stopwords.txt" stopWordsIgnoreCase="true"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="org.apache.lucene.analysis.jate.PunctuationRemoverFactory"
            stripAnySymbols="false"
            stripLeadingSymbols="true" stripTrailingSymbols="true"/>
    <filter class="org.apache.lucene.analysis.jate.EnglishLemmatisationFilterFactory"
            lemmaResourceDir="lemmatiser"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"
            enablePositionIncrements="true"/>
  </analyzer>
</fieldType>
<!-- Candidate term field: the text indexed here is analysed into candidate terms.
     Both indexed and termVectors have to be true. -->
<field name="jate_cterms"
       type="jate_text_2_terms"
       indexed="true"
       stored="false"
       multiValued="false"
       termVectors="true"/>
<!-- A configuration for N-gram based candidate extraction.
     Stages run in the order listed: char filters, Solr standard tokenizer, ASCII folding,
     OpenNLP PoS tagger, lower-casing, ComplexShingleFilter (minTokens=2, maxTokens=5,
     unigrams also output), EnglishLemmatisationFilter and StopFilter. -->
<fieldType name="jate_text_2_terms" class="solr.TextField" positionIncrementGap="100">
  <analyzer type="index">
    <!-- Character filters for irregular text.
         The double quote, less-than, greater-than and ampersand replacements are written as
         entity references: a raw ", < or & inside a double-quoted attribute value makes the
         file not well-formed. After parsing, each replacement is still the plain character
         surrounded by spaces. -->
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u0027" replacement=" ' "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u000a" replacement=" \\n "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u0009" replacement=" \\t "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u0008" replacement=" \\b "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u000d" replacement=" \\r "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u000c" replacement=" \\f "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u0022" replacement=" &quot; "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u005c" replacement=" \\ "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u003c" replacement=" &lt; "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u003e" replacement=" &gt; "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u003d" replacement=" = "/>
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\u0026" replacement=" &amp; "/>
    <charFilter class="solr.HTMLStripCharFilterFactory"/>

    <tokenizer class="solr.StandardTokenizerFactory"/>

    <filter class="solr.ASCIIFoldingFilterFactory"/>
    <filter class="org.apache.lucene.analysis.jate.OpenNLPPOSTaggerFactory"
            posTaggerClass="uk.ac.shef.dcs.jate.nlp.opennlp.POSTaggerOpenNLP"
            posTaggerModel="en-pos-maxent.bin"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <!-- Candidate selection step of this variant: shingles of 2 to 5 tokens plus unigrams.
         NOTE(review): maxCharLength is 50 here, while the shingle filter of the
         jate_text_2_ngrams field type uses 40. Confirm that candidates longer than
         40 characters can still be looked up in jate_ngraminfo. -->
    <filter class="org.apache.lucene.analysis.jate.ComplexShingleFilterFactory"
            minTokens="2" maxTokens="5"
            maxCharLength="50" minCharLength="2"
            removeLeadingStopWords="true" removeTrailingStopWords="true"
            removeLeadingSymbolicTokens="true" removeTrailingSymbolicTokens="true"
            stripAnySymbolChars="false"
            stripLeadingSymbolChars="true" stripTrailingSymbolChars="true"
            stopWords="stopwords.txt" stopWordsIgnoreCase="true"
            outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" "/>
    <filter class="org.apache.lucene.analysis.jate.EnglishLemmatisationFilterFactory"
            lemmaResourceDir="lemmatiser"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"
            enablePositionIncrements="true"/>
  </analyzer>
</fieldType>