diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java
index 6356978837..14d468418c 100644
--- a/grobid-core/src/main/java/org/grobid/core/data/Table.java
+++ b/grobid-core/src/main/java/org/grobid/core/data/Table.java
@@ -141,7 +141,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}
if (desc != null && config.isWithSentenceSegmentation()) {
- formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage());
+ formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
// we need a sentence segmentation of the table caption, for that we need to introduce
// a <div>, then a <p>
@@ -215,7 +215,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
if (noteNode != null && config.isWithSentenceSegmentation()) {
// we need a sentence segmentation of the figure caption
- formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage());
+ formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
}
// enclose note content in a <p> element
diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index 7359577bcf..c50ee73c69 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -28,6 +28,7 @@
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.lang.Language;
import org.grobid.core.layout.*;
+import org.grobid.core.lexicon.Lexicon;
import org.grobid.core.utilities.SentenceUtilities;
import org.grobid.core.tokenization.TaggingTokenCluster;
import org.grobid.core.tokenization.TaggingTokenClusteror;
@@ -1320,7 +1321,7 @@ private StringBuilder toTEINote(StringBuilder tei,
if (config.isWithSentenceSegmentation()) {
- segmentIntoSentences(pNote, noteTokens, config, doc.getLanguage());
+ segmentIntoSentences(pNote, noteTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
}
desc.appendChild(pNote);
@@ -1522,7 +1523,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(clusterTokens);
if (isNewParagraph(lastClusterLabel, curParagraph)) {
if (curParagraph != null && config.isWithSentenceSegmentation()) {
- segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage());
+ segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
}
curParagraph = teiElement("p");
if (config.isGenerateTeiIds()) {
@@ -1550,7 +1551,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
} else {
if (isNewParagraph(lastClusterLabel, curParagraph)) {
if (curParagraph != null && config.isWithSentenceSegmentation()) {
- segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage());
+ segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
}
curParagraph = teiElement("p");
if (config.isGenerateTeiIds()) {
@@ -1769,7 +1770,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
// in case we segment paragraph into sentences, we still need to do it for the last paragraph
if (curParagraph != null && config.isWithSentenceSegmentation()) {
- segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage());
+ segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
}
// remove possibly empty div in the div list
@@ -1836,6 +1837,10 @@ public static boolean isNewParagraph(TaggingLabel lastClusterLabel, Element curP
}
public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curParagraphTokens, GrobidAnalysisConfig config, String lang) {
+ segmentIntoSentences(curParagraph, curParagraphTokens, config, lang, new ArrayList<>());
+ }
+
+ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curParagraphTokens, GrobidAnalysisConfig config, String lang, List<PDFAnnotation> annotations) {
// in order to avoid having a sentence boundary in the middle of a ref element
// (which is frequent given the abbreviation in the reference expression, e.g. Fig.)
// we only consider for sentence segmentation the text under <p> and skip the text under <ref>.
@@ -1844,7 +1849,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
// in xom, the following gives all the text under the element, for the whole subtree
String text = curParagraph.getValue();
- if (text == null || text.length() == 0)
+ if (StringUtils.isEmpty(text))
return;
// identify ref nodes, ref spans and ref positions
@@ -1861,8 +1866,8 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
// for readability in another conditional
if (((Element) theNode).getLocalName().equals("ref")) {
// map character offset of the node
- mapRefNodes.put(Integer.valueOf(pos), theNode);
- refPositions.add(Integer.valueOf(pos));
+ mapRefNodes.put(pos, theNode);
+ refPositions.add(pos);
String chunk = theNode.getValue();
forbiddenPositions.add(new OffsetPosition(pos, pos+chunk.length()));
@@ -1871,6 +1876,9 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
}
}
+ List<OffsetPosition> offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations, text);
+ forbiddenPositions.addAll(offsetPositionsUrls);
+
List<OffsetPosition> theSentences =
SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang));
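For context on the two added lines above: every span in forbiddenPositions is treated as atomic by the sentence segmenter, so a URL with internal dots can no longer produce a spurious sentence boundary. A minimal sketch of the behavior, with illustrative offsets rather than values from the patch (assumes the GROBID core imports used elsewhere in this diff):

    // Illustrative sketch: spans in forbiddenPositions never contain a boundary.
    String sample = "See https://github.com/kermitt2/delft/issues/150 for details. Next sentence.";
    List<LayoutToken> sampleTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(sample);
    List<OffsetPosition> forbidden = new ArrayList<>();
    forbidden.add(new OffsetPosition(4, 48)); // the URL span, end exclusive
    List<OffsetPosition> sentenceSpans = SentenceUtilities.getInstance()
            .runSentenceDetection(sample, forbidden, sampleTokens, new Language("en"));
    // two sentences are expected; no boundary can fall inside offsets 4..48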
@@ -1893,7 +1901,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curParagraphTokens, GrobidAnalysisConfig config, String lang, List<PDFAnnotation> annotations)
for(int i=0; i<theSentences.size(); i++) {
diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java
--- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java
+++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java
@@ ... @@ public Pair<String, LayoutTokenization> processShort(List<LayoutToken> tokens, Document doc) {
List<LayoutToken> currentChunk = new ArrayList<>();
int currentPos = 0;
for(LayoutToken token : tokens) {
- if (currentChunk.size() != 0) {
+ if (CollectionUtils.isNotEmpty(currentChunk)) {
int tokenPos = token.getOffset();
if (currentPos != tokenPos) {
// new chunk
@@ -508,7 +509,7 @@ public Pair<String, LayoutTokenization> processShort(List<LayoutToken> tokens, Do
LayoutTokenization layouts = featSeg.getRight();
if (layouts != null)
layoutTokenization = layouts.getTokenization();
- if ( (featuredText != null) && (featuredText.trim().length() > 0) ) {
+ if (StringUtils.isNotBlank(featuredText)) {
res = label(featuredText);
res = postProcessFullTextLabeledText(res);
}
diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java
index c92b270ff1..1068be3e28 100644
--- a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java
+++ b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java
@@ -102,7 +102,7 @@ public MutablePair<Element, MutableTriple<List<Funding>, List<Person>, List<Affiliation>>>
diff --git a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
--- a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
+++ b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
@@ -563,7 +565,7 @@ public void initResearchInfrastructures() {
String[] pieces = line.split(";", -1); // -1 for getting empty tokens too
if (pieces.length == 3) {
if (pieces[0].length() > 0) {
-
+
if (pieces[1].length() > 0) {
OrganizationRecord localInfra = new OrganizationRecord(pieces[0], pieces[1], "en");
List<OrganizationRecord> localInfraList = researchOrganizations.get(pieces[0].toLowerCase());
@@ -608,7 +610,7 @@ public void initResearchInfrastructures() {
throw new GrobidResourceException("Error when compiling lexicon matcher for research infrastructure.", e);
} catch (Exception e) {
throw new GrobidException("An exception occured while running Grobid Lexicon init.", e);
- }
+ }
}
/**
@@ -642,15 +644,15 @@ public boolean isPunctuation(String s) {
public List<OrganizationRecord> getOrganizationNamingInfo(String name) {
if (researchOrganizations == null)
return null;
- return researchOrganizations.get(name.toLowerCase());
+ return researchOrganizations.get(name.toLowerCase());
}
/**
* Map the language codes used by the language identifier component to the normal
* language name.
*
- * Note: due to an older bug, kr is currently map to Korean too - this should
- * disappear at some point in the future after retraining of models
+ * Note: due to an older bug, kr is currently mapped to Korean too - this should
+ * disappear at some point in the future after retraining of the models
*
* @param code the language to be mapped
*/
@@ -896,7 +898,7 @@ public List<OffsetPosition> charPositionsOrganisationNames(String s) {
/**
* Soft look-up in organisation names gazetteer for a tokenize sequence.
- * It return a list of positions referring to the character positions within the input
+ * It returns a list of positions referring to the character positions within the input
* sequence.
*
* @param s the input list of LayoutToken
@@ -987,7 +989,7 @@ public List<OffsetPosition> tokenPositionsLocationNames(List<LayoutToken> s) {
}
/**
- * Soft look-up in location name gazetteer for a string, return a list of positions referring
+ * Soft look-up in location name gazetteer for a string, returning a list of positions referring
* to the character positions within the string.
*
* For example "The car is in Milan" as Milan is a location, would return OffsetPosition(14,19)
@@ -1004,7 +1006,7 @@ public List<OffsetPosition> charPositionsLocationNames(String s) {
}
/**
- * Soft look-up in location name gazetteer for a list of LayoutToken, return a list of
+ * Soft look-up in location name gazetteer for a list of LayoutToken, returning a list of
* positions referring to the character positions in the input sequence.
*
* For example "The car is in Milan" as Milan is a location, would return OffsetPosition(14,19)
@@ -1092,7 +1094,7 @@ public List<OffsetPosition> charPositionsPersonTitle(List<LayoutToken> s) {
public List<OffsetPosition> tokenPositionsIdentifierPattern(List<LayoutToken> tokens) {
List<OffsetPosition> result = new ArrayList<>();
String text = LayoutTokensUtil.toText(tokens);
-
+
// DOI positions
result = tokenPositionsDOIPattern(tokens, text);
@@ -1115,10 +1117,10 @@ public List<OffsetPosition> tokenPositionsIdentifierPattern(List<LayoutToken> to
public List<OffsetPosition> tokenPositionsDOIPattern(List<LayoutToken> tokens, String text) {
List<OffsetPosition> textResult = new ArrayList<>();
Matcher doiMatcher = TextUtilities.DOIPattern.matcher(text);
- while (doiMatcher.find()) {
+ while (doiMatcher.find()) {
textResult.add(new OffsetPosition(doiMatcher.start(), doiMatcher.end()));
}
- return Utilities.convertStringOffsetToTokenOffset(textResult, tokens);
+ return convertStringOffsetToTokenOffset(textResult, tokens);
}
/**
@@ -1128,11 +1130,11 @@ public List<OffsetPosition> tokenPositionsDOIPattern(List<LayoutToken> tokens, S
public List<OffsetPosition> tokenPositionsArXivPattern(List<LayoutToken> tokens, String text) {
List<OffsetPosition> textResult = new ArrayList<>();
Matcher arXivMatcher = TextUtilities.arXivPattern.matcher(text);
- while (arXivMatcher.find()) {
+ while (arXivMatcher.find()) {
//System.out.println(arXivMatcher.start() + " / " + arXivMatcher.end() + " / " + text.substring(arXivMatcher.start(), arXivMatcher.end()));
textResult.add(new OffsetPosition(arXivMatcher.start(), arXivMatcher.end()));
}
- return Utilities.convertStringOffsetToTokenOffset(textResult, tokens);
+ return convertStringOffsetToTokenOffset(textResult, tokens);
}
@@ -1141,7 +1143,7 @@ public List<OffsetPosition> tokenPositionsArXivPattern(List<LayoutToken> tokens,
*/
public List<OffsetPosition> tokenPositionsISSNPattern(List<LayoutToken> tokens) {
List<OffsetPosition> result = new ArrayList<>();
-
+
// TBD !
return result;
@@ -1161,50 +1163,71 @@ public List<OffsetPosition> tokenPositionsISBNPattern(List<LayoutToken> tokens)
/**
* Identify in tokenized input the positions of an URL pattern with token positions
*/
- public List<OffsetPosition> tokenPositionsUrlPattern(List<LayoutToken> tokens) {
- //List<OffsetPosition> result = new ArrayList<>();
- String text = LayoutTokensUtil.toText(tokens);
- List<OffsetPosition> textResult = new ArrayList<>();
- Matcher urlMatcher = TextUtilities.urlPattern.matcher(text);
- while (urlMatcher.find()) {
- //System.out.println(urlMatcher.start() + " / " + urlMatcher.end() + " / " + text.substring(urlMatcher.start(), urlMatcher.end()));
- textResult.add(new OffsetPosition(urlMatcher.start(), urlMatcher.end()));
- }
- return Utilities.convertStringOffsetToTokenOffset(textResult, tokens);
+ public static List<OffsetPosition> tokenPositionsUrlPattern(List<LayoutToken> tokens) {
+ List<OffsetPosition> textResult = characterPositionsUrlPattern(tokens);
+ return convertStringOffsetToTokenOffset(textResult, tokens);
}
/**
* Identify in tokenized input the positions of an URL pattern with character positions
*/
- public List<OffsetPosition> characterPositionsUrlPattern(List<LayoutToken> tokens) {
- //List<OffsetPosition> result = new ArrayList<>();
+ public static List<OffsetPosition> characterPositionsUrlPattern(List<LayoutToken> tokens) {
String text = LayoutTokensUtil.toText(tokens);
List<OffsetPosition> textResult = new ArrayList<>();
Matcher urlMatcher = TextUtilities.urlPattern.matcher(text);
- while (urlMatcher.find()) {
+ while (urlMatcher.find()) {
textResult.add(new OffsetPosition(urlMatcher.start(), urlMatcher.end()));
}
return textResult;
}
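A note on conventions that the refactor above preserves: characterPositionsUrlPattern returns character offsets over the text rebuilt from the tokens, end exclusive, while tokenPositionsUrlPattern returns token indices with an inclusive upper bound. A usage sketch mirroring the new unit tests at the bottom of this diff (assumes the imports used in those tests):

    // Sketch of the two offset conventions (see LexiconTest below for real cases).
    List<LayoutToken> tokens = GrobidAnalyzer.getInstance()
            .tokenizeWithLayoutToken("This work was distributed on http:// github.com/myUsername/MyProject");
    String tokenText = LayoutTokensUtil.toText(tokens);

    OffsetPosition charSpan = Lexicon.characterPositionsUrlPattern(tokens).get(0);
    String byChars = tokenText.substring(charSpan.start, charSpan.end); // end exclusive

    OffsetPosition tokenSpan = Lexicon.tokenPositionsUrlPattern(tokens).get(0);
    String byTokens = LayoutTokensUtil.toText(tokens.subList(tokenSpan.start, tokenSpan.end + 1)); // end inclusive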
/**
- * Identify in tokenized input the positions of an URL pattern with character positions,
+ * Identify in tokenized input the positions of a URL pattern with character positions,
* and refine positions based on possible PDF URI annotations.
- *
+ *
* This will produce better quality recognized URL, avoiding missing suffixes and problems
* with break lines and spaces.
**/
public static List<OffsetPosition> characterPositionsUrlPatternWithPdfAnnotations(
- List<LayoutToken> layoutTokens,
- List<PDFAnnotation> pdfAnnotations,
- String text) {
- List<OffsetPosition> urlPositions = Lexicon.getInstance().characterPositionsUrlPattern(layoutTokens);
+ List<LayoutToken> layoutTokens,
+ List<PDFAnnotation> pdfAnnotations,
+ String text) {
+
+ List<OffsetPosition> urlTokensPositions = tokenPositionUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations);
+
+ // We need to adjust the end of the positions to avoid problems with the sublist
+ // that is used in the following method
+ urlTokensPositions.stream().forEach(o -> o.end += 1);
+
+ // here we need to match the offsetPositions related to the text obtained from the layoutTokens with the text
+ // parameter, which may differ (spaces, hyphenation, line breaks)
+ return TextUtilities.matchTokenAndString(layoutTokens, text, urlTokensPositions);
+ }
+
+ /**
+ * This method returns the token positions with respect to the layout tokens;
+ * the output token offsets are (inclusive, inclusive)
+ */
+ public static List<OffsetPosition> tokenPositionUrlPatternWithPdfAnnotations(
+ List<LayoutToken> layoutTokens,
+ List<PDFAnnotation> pdfAnnotations) {
+
+ return convertStringOffsetToTokenOffset(characterPositionsUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations), layoutTokens);
+ }
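The (inclusive, inclusive) convention is why the text-aligned variant above bumps each end by one before calling matchTokenAndString: subList is end exclusive. A small sketch of the invariant, assuming the toy URL below is matched by TextUtilities.urlPattern:

    // Illustrative: inclusive token offsets must be widened by one for subList.
    List<LayoutToken> toyTokens = GrobidAnalyzer.getInstance()
            .tokenizeWithLayoutToken("see http://example.org now");
    List<PDFAnnotation> noAnnotations = new ArrayList<>(); // no PDF link annotations here
    OffsetPosition inclusiveSpan =
            Lexicon.tokenPositionUrlPatternWithPdfAnnotations(toyTokens, noAnnotations).get(0);
    String url = LayoutTokensUtil.toText(toyTokens.subList(inclusiveSpan.start, inclusiveSpan.end + 1));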
+
+ /**
+ * This method returns the character offsets relative to the string obtained from the layout tokens.
+ * Notice the absence of the String text parameter.
+ */
+ public static List<OffsetPosition> characterPositionsUrlPatternWithPdfAnnotations(
+ List<LayoutToken> layoutTokens,
+ List<PDFAnnotation> pdfAnnotations) {
+ List<OffsetPosition> urlPositions = Lexicon.characterPositionsUrlPattern(layoutTokens);
List resultPositions = new ArrayList<>();
// do we need to extend the url position based on additional position of the corresponding
// PDF annotation?
for(OffsetPosition urlPosition : urlPositions) {
-
int startPos = urlPosition.start;
int endPos = urlPosition.end;
@@ -1230,11 +1253,10 @@ public static List<OffsetPosition> characterPositionsUrlPatternWithPdfAnnotation
tokenIndex++;
}
- //String urlString = LayoutTokensUtil.toText(urlTokens);
- String urlString = text.substring(startPos, endPos);
+ String urlString = LayoutTokensUtil.toText(urlTokens);
PDFAnnotation targetAnnotation = null;
- if (urlTokens.size()>0) {
+ if (CollectionUtils.isNotEmpty(urlTokens)) {
LayoutToken lastToken = urlTokens.get(urlTokens.size()-1);
if (pdfAnnotations != null) {
for (PDFAnnotation pdfAnnotation : pdfAnnotations) {
@@ -1253,7 +1275,7 @@ public static List<OffsetPosition> characterPositionsUrlPatternWithPdfAnnotation
String destination = targetAnnotation.getDestination();
int destinationPos = 0;
- if (destination.indexOf(urlString) != -1) {
+ if (destination.contains(urlString)) {
destinationPos = destination.indexOf(urlString)+urlString.length();
}
@@ -1261,7 +1283,7 @@ public static List<OffsetPosition> characterPositionsUrlPatternWithPdfAnnotation
for(int j=endTokensIndex+1; j<layoutTokens.size(); j++) {
LayoutToken nextToken = layoutTokens.get(j);
int pos = destination.indexOf(nextToken.getText(), destinationPos);
if (pos != -1) {
endPos += nextToken.getText().length();
destinationPos = pos + nextToken.getText().length();
urlTokens.add(nextToken);
- } else
+ } else
break;
}
}
}
// finally avoid ending a URL by a dot, because it can harm the sentence segmentation
- if (text.charAt(endPos-1) == '.')
- endPos = endPos-1;
+ if (StringUtils.substring(LayoutTokensUtil.toText(layoutTokens), startPos, endPos).endsWith(".")) {
+ endPos = endPos - 1;
+ }
OffsetPosition position = new OffsetPosition();
position.start = startPos;
@@ -1303,11 +1326,11 @@ public List<OffsetPosition> tokenPositionsEmailPattern(List<LayoutToken> tokens)
return new ArrayList<>();
List<OffsetPosition> textResult = new ArrayList<>();
Matcher emailMatcher = TextUtilities.emailPattern.matcher(text);
- while (emailMatcher.find()) {
+ while (emailMatcher.find()) {
//System.out.println(urlMatcher.start() + " / " + urlMatcher.end() + " / " + text.substring(urlMatcher.start(), urlMatcher.end()));
textResult.add(new OffsetPosition(emailMatcher.start(), emailMatcher.end()));
}
- return Utilities.convertStringOffsetToTokenOffset(textResult, tokens);
+ return convertStringOffsetToTokenOffset(textResult, tokens);
}
}
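Taken together, the new static entry points form a small pipeline: regex matching over the token text, extension of each match through overlapping PDF URI annotations, then optional re-alignment to a caller-supplied string. A hedged summary sketch, where the sample text and the empty annotation list are illustrative:

    // The three entry points, lowest to highest level.
    List<LayoutToken> tokens = GrobidAnalyzer.getInstance()
            .tokenizeWithLayoutToken("code at https://github.com/kermitt2/grobid here");
    List<PDFAnnotation> annotations = new ArrayList<>(); // none in this toy example

    // 1) character offsets over the token text, possibly extended by annotations
    List<OffsetPosition> charOnTokenText =
            Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokens, annotations);

    // 2) the same spans as (inclusive, inclusive) token indices
    List<OffsetPosition> tokenSpans =
            Lexicon.tokenPositionUrlPatternWithPdfAnnotations(tokens, annotations);

    // 3) character offsets re-aligned to an external string with different spacing
    String externalText = "code at https://github.com/kermitt2/grobid here";
    List<OffsetPosition> charOnExternal =
            Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokens, annotations, externalText);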
diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java
index 06f69bcdee..f0e6cf03af 100755
--- a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java
+++ b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java
@@ -1556,4 +1556,59 @@ public static org.apache.commons.lang3.tuple.Pair
+
+ public static List<OffsetPosition> matchTokenAndString(List<LayoutToken> layoutTokens, String text, List<OffsetPosition> positions) {
+ List<OffsetPosition> newPositions = new ArrayList<>();
+ StringBuilder accumulator = new StringBuilder();
+ int pos = 0;
+ int textPositionOfToken = 0;
+
+ for (OffsetPosition position : positions) {
+ List<LayoutToken> annotationTokens = layoutTokens.subList(position.start, position.end);
+ boolean first = true;
+ accumulator = new StringBuilder();
+ for (int i = 0; i < annotationTokens.size(); i++) {
+ LayoutToken token = annotationTokens.get(i);
+ if (StringUtils.isEmpty(token.getText()))
+ continue;
+ textPositionOfToken = text.indexOf(token.getText(), pos);
+ if (textPositionOfToken != -1) {
+ //We update pos only at the first token of the annotation positions
+ if (first) {
+ pos = textPositionOfToken;
+ first = false;
+ }
+ accumulator.append(token);
+ } else {
+ if (SentenceUtilities.toSkipToken(token.getText())) {
+ continue;
+ }
+ if (StringUtils.isNotEmpty(accumulator)) {
+ int accumulatorTextLength = accumulator.toString().length();
+ int start = text.indexOf(accumulator.toString(), pos);
+ int end = start + accumulatorTextLength;
+ newPositions.add(new OffsetPosition(start, end));
+ pos = end;
+ break;
+ }
+ pos = textPositionOfToken;
+ }
+ }
+ if (StringUtils.isNotEmpty(accumulator)) {
+ int annotationTextLength = accumulator.toString().length();
+ int start = text.indexOf(accumulator.toString(), pos);
+ int end = start + annotationTextLength;
+ newPositions.add(new OffsetPosition(start, end));
+ pos = end;
+ accumulator = new StringBuilder();
+ }
+
+ }
+ if (StringUtils.isNotEmpty(accumulator)) {
+ int start = text.indexOf(accumulator.toString(), pos);
+ newPositions.add(new OffsetPosition(start, start + accumulator.toString().length()));
+ }
+
+ return newPositions;
+ }
}
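On the method added above: the token stream can contain newline or hyphenation tokens that the caller's string replaces with plain spaces, so token-derived spans cannot be mapped by offset arithmetic alone; matchTokenAndString re-anchors each span by searching the accumulated token text inside the target string. A usage sketch with illustrative token indices (exact values are in TextUtilitiesTest below):

    // Illustrative: token span -> character span on a differently-spaced string.
    List<LayoutToken> tokens = GrobidAnalyzer.getInstance()
            .tokenizeWithLayoutToken("available at https://github.com/lfoppiano/ \nsupercon2.");
    String callerText = "available at https://github.com/lfoppiano/ supercon2.";
    // token indices are illustrative; ends are exclusive, as expected by subList
    List<OffsetPosition> tokenSpans = Arrays.asList(new OffsetPosition(4, 18));
    List<OffsetPosition> charSpans = TextUtilities.matchTokenAndString(tokens, callerText, tokenSpans);
    // each returned span reads back with callerText.substring(span.start, span.end)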
diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/Utilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/Utilities.java
index de88ab3aa4..d4a838a5df 100755
--- a/grobid-core/src/main/java/org/grobid/core/utilities/Utilities.java
+++ b/grobid-core/src/main/java/org/grobid/core/utilities/Utilities.java
@@ -504,7 +504,7 @@ public static List<OffsetPosition> convertStringOffsetToTokenOffsetOld(
}
/**
- * This version uses actual LayoutToken offsets relative to the tokens present in argment only.
+ * This version uses actual LayoutToken offsets relative to the tokens present in argument only.
* It supposes that the stringPosition have been identified on the provided tokens only, and not
* restricted to the complete document.
*/
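Since the javadoc fix above touches convertStringOffsetToTokenOffset, its contract in short: character spans in, token-index spans out, with the returned upper bound inclusive. A sketch matching the new UtilitiesTest at the end of this diff:

    // "This is a token." with character span (5, 9) covers "is a";
    // the returned token offsets are inclusive on both ends.
    List<LayoutToken> toks = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken("This is a token.");
    List<OffsetPosition> spans = Utilities.convertStringOffsetToTokenOffset(
            Arrays.asList(new OffsetPosition(5, 9)), toks);
    String covered = LayoutTokensUtil.toText(toks.subList(spans.get(0).start, spans.get(0).end + 1)); // "is a"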
diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java
index 620f01a73a..2d888520ec 100755
--- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java
+++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java
@@ -1,13 +1,16 @@
package org.grobid.core.lexicon;
+import org.apache.commons.lang3.StringUtils;
import org.grobid.core.analyzers.GrobidAnalyzer;
+import org.grobid.core.layout.BoundingBox;
+import org.grobid.core.layout.PDFAnnotation;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.grobid.core.layout.LayoutToken;
-import org.junit.AfterClass;
import org.junit.Before;
import org.junit.Test;
+import java.util.ArrayList;
import java.util.List;
import static org.hamcrest.CoreMatchers.is;
diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
new file mode 100644
index 0000000000..8b3b501488
--- /dev/null
+++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
@@ -0,0 +1,247 @@
+package org.grobid.core.lexicon;
+
+import org.apache.commons.lang3.StringUtils;
+import org.grobid.core.analyzers.GrobidAnalyzer;
+import org.grobid.core.layout.BoundingBox;
+import org.grobid.core.layout.LayoutToken;
+import org.grobid.core.layout.PDFAnnotation;
+import org.grobid.core.utilities.LayoutTokensUtil;
+import org.grobid.core.utilities.OffsetPosition;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.hamcrest.CoreMatchers.is;
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.hasSize;
+
+public class LexiconTest {
+ @Test
+ public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval() throws Exception {
+ final String input = "This work was distributed on http:// github.com/myUsername/MyProject";
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+
+ List<OffsetPosition> offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition firstURL = offsetPositions.get(0);
+ assertThat(input.substring(firstURL.start, firstURL.end), is("http:// github.com/myUsername/MyProject"));
+ }
+
+ @Test
+ public void testTokenPositionsUrlPattern_URL_shouldReturnCorrectInterval() throws Exception {
+ final String input = "This work was distributed on http:// github.com/myUsername/MyProject";
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+
+ List<OffsetPosition> offsetPositions = Lexicon.tokenPositionsUrlPattern(tokenisedInput);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition firstURL = offsetPositions.get(0);
+ //Note: the intervals returned by the method Utilities.convertStringOffsetToTokenOffset
+ // consider the upper index to be included, while Java considers the upper index to be excluded
+ assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(firstURL.start, firstURL.end + 1)), is("http:// github.com/myUsername/MyProject"));
+ }
+
+ @Test
+ @Ignore("This test will fail, it can be used to test a real case when updating the regular expression")
+ public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval_2() throws Exception {
+ final String input = "720 137409 The Government of Lao PDR 2005 Forestry Strategy to the year 2020 of the Lao PDR (available at: https://faolex.fao.org/ docs/pdf/lao144178.pdf)";
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+
+ List<OffsetPosition> offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition url = offsetPositions.get(0);
+ assertThat(input.substring(url.start, url.end), is("https://faolex.fao.org/ docs/pdf/lao144178.pdf"));
+ }
+
+ @Test
+ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval() throws Exception {
+ final String input = "1. 'internal status' indicates that their records should be \n" +
+ "hidden in the interface. \n" +
+ "2. In our previous work [1] we reported 77.03% F1-\n" +
+ "score. There is a slight decrease in absolute scores \n" +
+ "between DeLFT 0.2.8 and DeLFT 0.3.0. One cause \n" +
+ "may be the use of different hyperparameters in \n" +
+ "version 0.3.0 such as batch size and learning rate. \n" +
+ "However, the most probable cause could be the \n" +
+ "impact of using the Huggingface tokenizers \n" +
+ "library which is suffering from quality issues \n" +
+ "https://github.com/kermitt2/delft/issues/150. \n" +
+ "\n" +
+ "\n";
+
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+
+ //This is the actual text that is passed and is different from the layoutToken text.
+ final String inputReal = "1. 'internal status' indicates that their records should be hidden in the interface. 2. In our previous work [1] we reported 77.03% F1score. There is a slight decrease in absolute scores between DeLFT 0.2.8 and DeLFT 0.3.0. One cause may be the use of different hyperparameters in version 0.3.0 such as batch size and learning rate. However, the most probable cause could be the impact of using the Huggingface tokenizers library which is suffering from quality issues https://github.com/kermitt2/delft/issues/150. ";
+
+ PDFAnnotation annotation = new PDFAnnotation();
+ annotation.setPageNumber(10);
+ List<BoundingBox> boundingBoxes = new ArrayList<>();
+ boundingBoxes.add(BoundingBox.fromPointAndDimensions(10, 84.30, 706.68, 177.39, 9.52));
+ annotation.setBoundingBoxes(boundingBoxes);
+ annotation.setDestination("https://github.com/kermitt2/delft/issues/150");
+ annotation.setType(PDFAnnotation.Type.URI);
+
+ List<PDFAnnotation> pdfAnnotations = List.of(annotation);
+ List<OffsetPosition> offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition url = offsetPositions.get(0);
+ assertThat(StringUtils.substring(input, url.start, url.end), is("https://github.com/kermitt2/delft/issues/150"));
+ }
+
+ @Test
+ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval2() throws Exception {
+ final String input = "This work is available at https://github.com/lfoppiano/ \n" +
+ "supercon2. The repository contains the code of the \n" +
+ "SuperCon 2 interface, the curation workflow, and the \n" +
+ "\n" +
+ "Table 2. Data support, the number of entities for each label in \n" +
+ "each of the datasets used for evaluating the ML models. The \n" +
+ "base dataset is the original dataset described in [18], and the \n" +
+ "curation dataset is automatically collected based on the data-\n" +
+ "base corrections by the interface and manually corrected. \n" +
+ "\n";
+
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+ LayoutToken lastTokenOfTheURL = tokenisedInput.get(19);
+ lastTokenOfTheURL.setPage(9);
+ lastTokenOfTheURL.setX(530.9363448275863);
+ lastTokenOfTheURL.setY(538.153);
+ lastTokenOfTheURL.setWidth(4.363655172413793);
+ lastTokenOfTheURL.setHeight(9.702);
+
+ PDFAnnotation annotation = new PDFAnnotation();
+ annotation.setPageNumber(9);
+ List<BoundingBox> boundingBoxes = new ArrayList<>();
+ boundingBoxes.add(BoundingBox.fromPointAndDimensions(9, 408.76, 537.11, 126.54, 10.49));
+ annotation.setBoundingBoxes(boundingBoxes);
+ annotation.setDestination("https://github.com/lfoppiano/supercon2");
+ annotation.setType(PDFAnnotation.Type.URI);
+
+ List<PDFAnnotation> pdfAnnotations = List.of(annotation);
+ List<OffsetPosition> offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition url = offsetPositions.get(0);
+ assertThat(input.substring(url.start, url.end), is("https://github.com/lfoppiano/ \nsupercon2"));
+ }
+
+ @Test
+ public void testTokensPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval2() throws Exception {
+ final String input = "This work is available at https://github.com/lfoppiano/ \n" +
+ "supercon2. The repository contains the code of the \n" +
+ "SuperCon 2 interface, the curation workflow, and the \n" +
+ "\n" +
+ "Table 2. Data support, the number of entities for each label in \n" +
+ "each of the datasets used for evaluating the ML models. The \n" +
+ "base dataset is the original dataset described in [18], and the \n" +
+ "curation dataset is automatically collected based on the data-\n" +
+ "base corrections by the interface and manually corrected. \n" +
+ "\n";
+
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+ LayoutToken lastTokenOfTheURL = tokenisedInput.get(19);
+ lastTokenOfTheURL.setPage(9);
+ lastTokenOfTheURL.setX(530.9363448275863);
+ lastTokenOfTheURL.setY(538.153);
+ lastTokenOfTheURL.setWidth(4.363655172413793);
+ lastTokenOfTheURL.setHeight(9.702);
+
+ PDFAnnotation annotation = new PDFAnnotation();
+ annotation.setPageNumber(9);
+ List<BoundingBox> boundingBoxes = new ArrayList<>();
+ boundingBoxes.add(BoundingBox.fromPointAndDimensions(9, 408.76, 537.11, 126.54, 10.49));
+ annotation.setBoundingBoxes(boundingBoxes);
+ annotation.setDestination("https://github.com/lfoppiano/supercon2");
+ annotation.setType(PDFAnnotation.Type.URI);
+
+ List<PDFAnnotation> pdfAnnotations = List.of(annotation);
+ List<OffsetPosition> offsetPositions = Lexicon.tokenPositionUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition url = offsetPositions.get(0);
+ // LF: we need a + 1 because the convention for the tokenPositionUrlPattern is inclusive, inclusive
+ assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(url.start, url.end + 1)), is("https://github.com/lfoppiano/ \nsupercon2"));
+ }
+
+ @Test
+ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectIntervalBasedOnText() throws Exception {
+ final String input = "This work is available at https://github.com/lfoppiano/ \n" +
+ "supercon2. The repository contains the code of the \n" +
+ "SuperCon 2 interface, the curation workflow, and the \n" +
+ "\n" +
+ "Table 2. Data support, the number of entities for each label in \n" +
+ "each of the datasets used for evaluating the ML models. The \n" +
+ "base dataset is the original dataset described in [18], and the \n" +
+ "curation dataset is automatically collected based on the data-\n" +
+ "base corrections by the interface and manually corrected. \n" +
+ "\n";
+
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+ LayoutToken lastTokenOfTheURL = tokenisedInput.get(19);
+ lastTokenOfTheURL.setPage(9);
+ lastTokenOfTheURL.setX(530.9363448275863);
+ lastTokenOfTheURL.setY(538.153);
+ lastTokenOfTheURL.setWidth(4.363655172413793);
+ lastTokenOfTheURL.setHeight(9.702);
+
+ PDFAnnotation annotation = new PDFAnnotation();
+ annotation.setPageNumber(9);
+ List<BoundingBox> boundingBoxes = new ArrayList<>();
+ boundingBoxes.add(BoundingBox.fromPointAndDimensions(9, 408.76, 537.11, 126.54, 10.49));
+ annotation.setBoundingBoxes(boundingBoxes);
+ annotation.setDestination("https://github.com/lfoppiano/supercon2");
+ annotation.setType(PDFAnnotation.Type.URI);
+ List<PDFAnnotation> pdfAnnotations = List.of(annotation);
+
+ //This is the actual text that is passed and is different from the layoutToken text.
+ final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. ";
+
+ List<OffsetPosition> offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations, inputReal);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition url = offsetPositions.get(0);
+ assertThat(inputReal.substring(url.start, url.end), is("https://github.com/lfoppiano/ supercon2"));
+ }
+
+ @Test
+ public void testCharacterPositionsUrlPatternWithPDFAnnotations2_URL_shouldReturnCorrectIntervalBasedOnText() throws Exception {
+ final String input = "Table S1: Gene annotations from which exon-exon junctions were extracted and unioned to obtain \n" +
+ "a list of annotated junctions. All tracks were taken from the UCSC Genome Browser [10] except for \n" +
+ "GENCODE [2], which was downloaded from the GENCODE website http://www.gencodegenes. \n" +
+ "org/releases/. Junction coordinates from hg38 annotations were lifted over to hg19 before the \n" +
+ "union was performed. Of all gene annotations listed here, the Swedish Bioinformatics Institute \n" +
+ "(SIB) genes has the most, with over 400,000 junctions for each of hg19 and hg38. \n";
+
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+ LayoutToken lastTokenOfTheURL = tokenisedInput.get(97);
+ lastTokenOfTheURL.setPage(19);
+ lastTokenOfTheURL.setX(465.54675000000003);
+ lastTokenOfTheURL.setY(404.908);
+ lastTokenOfTheURL.setWidth(68.727);
+ lastTokenOfTheURL.setHeight(9.0873);
+
+ PDFAnnotation annotation = new PDFAnnotation();
+ annotation.setPageNumber(19);
+ List<BoundingBox> boundingBoxes = new ArrayList<>();
+ boundingBoxes.add(BoundingBox.fromPointAndDimensions(19, 401.551, 402.396, 139.445, 12.901999999999987));
+ annotation.setBoundingBoxes(boundingBoxes);
+ annotation.setDestination("http://www.gencodegenes.org/releases/");
+ annotation.setType(PDFAnnotation.Type.URI);
+ List<PDFAnnotation> pdfAnnotations = List.of(annotation);
+
+ //This is the actual text that is passed and is different from the layoutToken text.
+ final String inputReal = "Table S1: Gene annotations from which exon-exon junctions were extracted and unioned to obtain a list of annotated junctions. All tracks were taken from the UCSC Genome Browser [10] except for GENCODE [2], which was downloaded from the GENCODE website http://www.gencodegenes. org/releases/. Junction coordinates from hg38 annotations were lifted over to hg19 before the union was performed. Of all gene annotations listed here, the Swedish Bioinformatics Institute (SIB) genes has the most, with over 400,000 junctions for each of hg19 and hg38. ";
+
+ List<OffsetPosition> offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations, inputReal);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition url = offsetPositions.get(0);
+ assertThat(inputReal.substring(url.start, url.end), is("http://www.gencodegenes. org/releases/"));
+ }
+}
diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java
index ff5ac7467b..8b53cc263e 100644
--- a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java
+++ b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java
@@ -8,12 +8,13 @@
import org.junit.Test;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import static org.hamcrest.CoreMatchers.is;
-import static org.hamcrest.CoreMatchers.startsWith;
import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.hasSize;
import static org.junit.Assert.*;
@@ -407,4 +408,166 @@ public void testOrcidPattern() {
}
}
}
+
+ @Test
+ public void testMatchTokenAndString() throws Exception {
+ final String input = "This work is available at https://github.com/lfoppiano/ \n" +
+ "supercon2. The repository contains the code of the \n" +
+ "SuperCon 2 interface, the curation workflow, and the \n" +
+ "\n" +
+ "Table 2. Data support, the number of entities for each label in \n" +
+ "each of the datasets used for evaluating the ML models. The \n" +
+ "base dataset is the original dataset described in [18], and the \n" +
+ "curation dataset is automatically collected based on the data-\n" +
+ "base corrections by the interface and manually corrected. \n" +
+ "\n";
+
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+ final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. ";
+ List<OffsetPosition> urlTokens = Arrays.asList(new OffsetPosition(10, 23));
+
+ List<OffsetPosition> offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, inputReal, urlTokens);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition url1 = offsetPositions.get(0);
+ assertThat(url1.start, is(26));
+ assertThat(url1.end, is(65));
+ assertThat(inputReal.substring(url1.start, url1.end), is("https://github.com/lfoppiano/ supercon2"));
+
+ }
+
+
+ @Test
+ public void testMatchTokenAndString_twoElements() throws Exception {
+ final String input = "This work is available at https://github.com/lfoppiano/ \n" +
+ "supercon2. The repository contains the code of the \n" +
+ "SuperCon 2 interface, the curation workflow, and the \n" +
+ "\n" +
+ "Table 2. Data support, the number of entities for each label in \n" +
+ "each of the datasets used for evaluating the ML models. The \n" +
+ "base dataset is the original dataset described in [18], and the \n" +
+ "curation dataset is automatically collected based on the data-\n" +
+ "base corrections by the interface and manually corrected. \n" +
+ "\n";
+
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+ final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. ";
+ List<OffsetPosition> urlTokens = Arrays.asList(new OffsetPosition(0, 3), new OffsetPosition(10, 23));
+
+ List<OffsetPosition> offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, inputReal, urlTokens);
+
+ assertThat(offsetPositions, hasSize(2));
+ OffsetPosition url0 = offsetPositions.get(0);
+ assertThat(url0.start, is(0));
+ assertThat(url0.end, is(9));
+
+ assertThat(inputReal.substring(url0.start, url0.end), is("This work"));
+
+ OffsetPosition url1 = offsetPositions.get(1);
+ assertThat(url1.start, is(26));
+ assertThat(url1.end, is(65));
+
+ assertThat(inputReal.substring(url1.start, url1.end), is("https://github.com/lfoppiano/ supercon2"));
+
+ }
+
+ @Test
+ public void testMatchTokenAndString_twoElementsWithEqualValue() throws Exception {
+ final String input = "Christophe Castagne, Claudie Marec, Claudie Marec, Claudio Stalder,";
+
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+ List<OffsetPosition> urlTokens = Arrays.asList(
+ new OffsetPosition(0, 3),
+ new OffsetPosition(5, 8),
+ new OffsetPosition(10, 13),
+ new OffsetPosition(15, 18)
+ );
+
+ List<OffsetPosition> offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, input, urlTokens);
+
+ assertThat(offsetPositions, hasSize(4));
+
+ OffsetPosition url0 = offsetPositions.get(0);
+ assertThat(url0.start, is(0));
+ assertThat(url0.end, is(19));
+
+ assertThat(input.substring(url0.start, url0.end), is("Christophe Castagne"));
+
+ OffsetPosition url1 = offsetPositions.get(1);
+ assertThat(url1.start, is(21));
+ assertThat(url1.end, is(34));
+
+ assertThat(input.substring(url1.start, url1.end), is("Claudie Marec"));
+
+ OffsetPosition url2 = offsetPositions.get(2);
+ assertThat(url2.start, is(36));
+ assertThat(url2.end, is(49));
+
+ assertThat(input.substring(url2.start, url2.end), is("Claudie Marec"));
+
+ OffsetPosition url3 = offsetPositions.get(3);
+ assertThat(url3.start, is(51));
+ assertThat(url3.end, is(66));
+
+ assertThat(input.substring(url3.start, url3.end), is("Claudio Stalder"));
+
+ }
+
+ @Test
+ public void testMatchTokenAndString_twoElementsWithEqualValue2() throws Exception {
+ final String input = "We thank Felix Randow, Shigeki Higashiyama and Feng Zhang for plasmids.We thank Florian Steinberg for discussions and disclosure of unpublished results.We thank Matthew Freeman for helpful discussions.We express our deep gratitude to Moises Mallo for advice concerning CRISPR plus CRISPR reagents.We are grateful for the assistance of Ana Nóvoa and IGC's transgenics and mouse facilities.We thank IGC's cell sorting/flow cytometry, sequencing, and histopathology facilities.";
+
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+ List<OffsetPosition> annotationTokenPositions = Arrays.asList(
+ new OffsetPosition(4, 7),
+ new OffsetPosition(9, 12),
+ new OffsetPosition(15, 18),
+ new OffsetPosition(27, 30),
+ new OffsetPosition(49, 52),
+ new OffsetPosition(71, 74),
+ new OffsetPosition(103, 106),
+ new OffsetPosition(109, 110),
+ new OffsetPosition(125, 126)
+ );
+
+ List<OffsetPosition> offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, input, annotationTokenPositions);
+
+ assertThat(offsetPositions, hasSize(9));
+
+ OffsetPosition url7 = offsetPositions.get(7);
+ assertThat(url7.start, is(349));
+ assertThat(url7.end, is(352));
+
+ assertThat(input.substring(url7.start, url7.end), is("IGC"));
+
+ OffsetPosition url8 = offsetPositions.get(8);
+ assertThat(url8.start, is(397));
+ assertThat(url8.end, is(400));
+
+ assertThat(input.substring(url8.start, url8.end), is("IGC"));
+
+ }
+
+ @Test
+ public void testMatchTokenAndString_twoElementsWithEqualValue3() throws Exception {
+ final String input = "We thank Benoit Demars for providing reaeration data and comments that signficantly improved the manuscript.This study was supported a NERC Case studentship awarded to DP, GYD and SJ, an ERC starting grant awarded to GYD, and the University of Exeter.";
+
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+ List<OffsetPosition> annotationTokenPositions = Arrays.asList(
+ new OffsetPosition(4, 7),
+ new OffsetPosition(40, 41),
+ new OffsetPosition(62, 63),
+ new OffsetPosition(79, 84)
+ );
+
+ List<OffsetPosition> offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, input, annotationTokenPositions);
+
+ assertThat(offsetPositions, hasSize(4));
+
+ OffsetPosition position1 = offsetPositions.get(1);
+ assertThat(input.substring(position1.start, position1.end), is("NERC"));
+
+ OffsetPosition position2 = offsetPositions.get(2);
+ assertThat(input.substring(position2.start, position2.end), is("ERC"));
+ }
}
diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/UtilitiesTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/UtilitiesTest.java
index 64d6d4be7a..9e5a6958ff 100644
--- a/grobid-core/src/test/java/org/grobid/core/utilities/UtilitiesTest.java
+++ b/grobid-core/src/test/java/org/grobid/core/utilities/UtilitiesTest.java
@@ -3,10 +3,17 @@
import java.io.File;
import java.io.IOException;
+import java.util.Arrays;
import java.util.List;
import java.util.ArrayList;
+import org.grobid.core.analyzers.GrobidAnalyzer;
+import org.grobid.core.layout.LayoutToken;
import org.junit.Test;
+
+import static org.grobid.core.utilities.Utilities.convertStringOffsetToTokenOffset;
+import static org.hamcrest.Matchers.hasSize;
+import static org.hamcrest.Matchers.is;
import static org.junit.Assert.*;
public class UtilitiesTest {
@@ -112,4 +119,16 @@ public void testMergePositionsOverlap() throws IOException {
assertEquals(positions.get(1).start, 7);
assertEquals(positions.get(1).end, 10);
}
+
+ @Test
+ public void testConvertStringOffsetToTokenOffset() throws Exception {
+ String input = "This is a token.";
+ List<LayoutToken> layoutTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+ OffsetPosition stringPosition = new OffsetPosition(5, 9);
+ List<OffsetPosition> tokenOffsets = convertStringOffsetToTokenOffset(Arrays.asList(stringPosition), layoutTokens);
+
+ assertThat(tokenOffsets, hasSize(1));
+ OffsetPosition position = tokenOffsets.get(0);
+ assertThat(LayoutTokensUtil.toText(layoutTokens.subList(position.start, position.end + 1)), is("is a"));
+ }
}