From 9d9455ae9d663822c76a1f94a66dd3375d986e74 Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Tue, 9 Apr 2024 07:59:21 +0900
Subject: [PATCH 01/14] add URL detection to avoid splitting URLs when running
 the sentence segmenter

---
 .../src/main/java/org/grobid/core/document/TEIFormatter.java | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index f66baaa0c0..33affa2d03 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -28,6 +28,7 @@ import org.grobid.core.exceptions.GrobidException;
 import org.grobid.core.lang.Language;
 import org.grobid.core.layout.*;
+import org.grobid.core.lexicon.Lexicon;
 import org.grobid.core.utilities.SentenceUtilities;
 import org.grobid.core.tokenization.TaggingTokenCluster;
 import org.grobid.core.tokenization.TaggingTokenClusteror;
@@ -1868,6 +1869,9 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
             }
         }
+        List<OffsetPosition> offsetPositionsUrls = Lexicon.getInstance().characterPositionsUrlPattern(curParagraphTokens);
+        forbiddenPositions.addAll(offsetPositionsUrls);
+
         List<OffsetPosition> theSentences =
             SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang));

From cff813863dbff4d31552d15a8e9e93fdf388d27c Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Thu, 11 Apr 2024 18:14:03 +0900
Subject: [PATCH 02/14] update lexicon and add more integration tests

---
 .../java/org/grobid/core/lexicon/Lexicon.java | 173 ++++++++++++------
 .../core/lexicon/LexiconIntegrationTest.java  | 159 +++++++++++++++-
 2 files changed, 270 insertions(+), 62 deletions(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
index 271dd6fa77..9666ad0c85 100755
--- a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
+++ b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
@@ -20,7 +20,9 @@ import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.SAXParserFactory;

+import org.apache.commons.collections4.CollectionUtils;
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
 import org.grobid.core.exceptions.GrobidException;
 import org.grobid.core.exceptions.GrobidResourceException;
 import org.grobid.core.lang.Language;
@@ -36,7 +38,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import org.apache.commons.lang3.tuple.Pair;
+import static org.grobid.core.utilities.Utilities.convertStringOffsetToTokenOffset;

/**
 * Class for managing all the lexical resources.
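 *
 * URL detection is also used by the TEIFormatter: the character positions of
 * recognized URLs are added to the "forbidden" positions so that the sentence
 * segmenter never splits inside a URL. A minimal usage sketch (hypothetical
 * driver code; the tokenization call mirrors the integration tests):
 *
 *   List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
 *   List<OffsetPosition> urls = Lexicon.getInstance().characterPositionsUrlPattern(tokens);
 *   forbiddenPositions.addAll(urls);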
@@ -101,19 +103,19 @@ private Lexicon() { initDictionary(); initNames(); // the loading of the journal and conference names is lazy - addDictionary(GrobidProperties.getGrobidHomePath() + File.separator + + addDictionary(GrobidProperties.getGrobidHomePath() + File.separator + "lexicon"+File.separator+"wordforms"+File.separator+"english.wf", Language.EN); - addDictionary(GrobidProperties.getGrobidHomePath() + File.separator + + addDictionary(GrobidProperties.getGrobidHomePath() + File.separator + "lexicon"+File.separator+"wordforms"+File.separator+"german.wf", Language.EN); addLastNames(GrobidProperties.getGrobidHomePath() + File.separator + "lexicon"+File.separator+"names"+File.separator+"names.family"); addLastNames(GrobidProperties.getGrobidHomePath() + File.separator + "lexicon"+File.separator+"names"+File.separator+"lastname.5k"); - addFirstNames(GrobidProperties.getGrobidHomePath() + File.separator + + addFirstNames(GrobidProperties.getGrobidHomePath() + File.separator + "lexicon"+File.separator+"names"+File.separator+"names.female"); - addFirstNames(GrobidProperties.getGrobidHomePath() + File.separator + + addFirstNames(GrobidProperties.getGrobidHomePath() + File.separator + "lexicon"+File.separator+"names"+File.separator+"names.male"); - addFirstNames(GrobidProperties.getGrobidHomePath() + File.separator + + addFirstNames(GrobidProperties.getGrobidHomePath() + File.separator + "lexicon"+File.separator+"names"+File.separator+"firstname.5k"); initCountryCodes(); addCountryCodes(GrobidProperties.getGrobidHomePath() + File.separator + @@ -465,33 +467,33 @@ public void initOrganisations() { try { organisationPattern = new FastMatcher(new File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/WikiOrganizations.lst")); - organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + + organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/government.government_agency")); - organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + + organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/known_corporations.lst")); - organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + + organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/venture_capital.venture_funded_company")); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for organisations.", e); } catch (IOException e) { - throw new GrobidResourceException("Cannot add term to matcher, because the lexicon resource file " + + throw new GrobidResourceException("Cannot add term to matcher, because the lexicon resource file " + "does not exist or cannot be read.", e); } catch (Exception e) { throw new GrobidException("An exception occured while running Grobid Lexicon init.", e); } } - + public void initOrgForms() { try { orgFormPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/orgClosings.txt")); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/orgClosings.txt")); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for organisations.", e); } catch (Exception e) { throw new GrobidException("An exception occured while running Grobid Lexicon init.", e); } } - + public void initLocations() { try { locationPattern = new FastMatcher(new @@ -522,8 +524,8 @@ public 
void initPersonSuffix() { public void initFunders() { try { funderPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/funders.txt"), - GrobidAnalyzer.getInstance(), true); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/funders.txt"), + GrobidAnalyzer.getInstance(), true); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for funders.", e); } catch (Exception e) { @@ -534,8 +536,8 @@ public void initFunders() { public void initResearchInfrastructures() { try { researchInfrastructurePattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/research_infrastructures.txt"), - GrobidAnalyzer.getInstance(), true); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/research_infrastructures.txt"), + GrobidAnalyzer.getInstance(), true); // store some name mapping researchOrganizations = new TreeMap<>(); @@ -563,7 +565,7 @@ public void initResearchInfrastructures() { String[] pieces = line.split(";", -1); // -1 for getting empty tokens too if (pieces.length == 3) { if (pieces[0].length() > 0) { - + if (pieces[1].length() > 0) { OrganizationRecord localInfra = new OrganizationRecord(pieces[0], pieces[1], "en"); List localInfraList = researchOrganizations.get(pieces[0].toLowerCase()); @@ -608,7 +610,7 @@ public void initResearchInfrastructures() { throw new GrobidResourceException("Error when compiling lexicon matcher for research infrastructure.", e); } catch (Exception e) { throw new GrobidException("An exception occured while running Grobid Lexicon init.", e); - } + } } /** @@ -642,15 +644,15 @@ public boolean isPunctuation(String s) { public List getOrganizationNamingInfo(String name) { if (researchOrganizations == null) return null; - return researchOrganizations.get(name.toLowerCase()); + return researchOrganizations.get(name.toLowerCase()); } /** * Map the language codes used by the language identifier component to the normal * language name. * - * Note: due to an older bug, kr is currently map to Korean too - this should - * disappear at some point in the future after retraining of models + * Note: due to an older bug, kr is currently map to Korean too - this should + * disappear at some point in the future after retraining of models * * @param code the language to be mapped */ @@ -896,7 +898,7 @@ public List charPositionsOrganisationNames(String s) { /** * Soft look-up in organisation names gazetteer for a tokenize sequence. - * It return a list of positions referring to the character positions within the input + * It return a list of positions referring to the character positions within the input * sequence. * * @param s the input list of LayoutToken @@ -987,7 +989,7 @@ public List tokenPositionsLocationNames(List s) { } /** - * Soft look-up in location name gazetteer for a string, return a list of positions referring + * Soft look-up in location name gazetteer for a string, return a list of positions referring * to the character positions within the string. * * For example "The car is in Milan" as Milan is a location, would return OffsetPosition(14,19) @@ -1004,7 +1006,7 @@ public List charPositionsLocationNames(String s) { } /** - * Soft look-up in location name gazetteer for a list of LayoutToken, return a list of + * Soft look-up in location name gazetteer for a list of LayoutToken, return a list of * positions referring to the character positions in the input sequence. 
* * For example "The car is in Milan" as Milan is a location, would return OffsetPosition(14,19) @@ -1092,7 +1094,7 @@ public List charPositionsPersonTitle(List s) { public List tokenPositionsIdentifierPattern(List tokens) { List result = new ArrayList(); String text = LayoutTokensUtil.toText(tokens); - + // DOI positions result = tokenPositionsDOIPattern(tokens, text); @@ -1115,10 +1117,10 @@ public List tokenPositionsIdentifierPattern(List to public List tokenPositionsDOIPattern(List tokens, String text) { List textResult = new ArrayList(); Matcher doiMatcher = TextUtilities.DOIPattern.matcher(text); - while (doiMatcher.find()) { + while (doiMatcher.find()) { textResult.add(new OffsetPosition(doiMatcher.start(), doiMatcher.end())); } - return Utilities.convertStringOffsetToTokenOffset(textResult, tokens); + return convertStringOffsetToTokenOffset(textResult, tokens); } /** @@ -1128,11 +1130,11 @@ public List tokenPositionsDOIPattern(List tokens, S public List tokenPositionsArXivPattern(List tokens, String text) { List textResult = new ArrayList(); Matcher arXivMatcher = TextUtilities.arXivPattern.matcher(text); - while (arXivMatcher.find()) { + while (arXivMatcher.find()) { //System.out.println(arXivMatcher.start() + " / " + arXivMatcher.end() + " / " + text.substring(arXivMatcher.start(), arXivMatcher.end())); textResult.add(new OffsetPosition(arXivMatcher.start(), arXivMatcher.end())); } - return Utilities.convertStringOffsetToTokenOffset(textResult, tokens); + return convertStringOffsetToTokenOffset(textResult, tokens); } @@ -1141,7 +1143,7 @@ public List tokenPositionsArXivPattern(List tokens, */ public List tokenPositionsISSNPattern(List tokens) { List result = new ArrayList(); - + // TBD ! return result; @@ -1161,50 +1163,99 @@ public List tokenPositionsISBNPattern(List tokens) /** * Identify in tokenized input the positions of an URL pattern with token positions */ - public List tokenPositionsUrlPattern(List tokens) { - //List result = new ArrayList(); - String text = LayoutTokensUtil.toText(tokens); - List textResult = new ArrayList(); - Matcher urlMatcher = TextUtilities.urlPattern.matcher(text); - while (urlMatcher.find()) { - //System.out.println(urlMatcher.start() + " / " + urlMatcher.end() + " / " + text.substring(urlMatcher.start(), urlMatcher.end())); - textResult.add(new OffsetPosition(urlMatcher.start(), urlMatcher.end())); - } - return Utilities.convertStringOffsetToTokenOffset(textResult, tokens); + public static List tokenPositionsUrlPattern(List tokens) { + List textResult = characterPositionsUrlPattern(tokens); + return convertStringOffsetToTokenOffset(textResult, tokens); } /** * Identify in tokenized input the positions of an URL pattern with character positions */ - public List characterPositionsUrlPattern(List tokens) { - //List result = new ArrayList(); + public static List characterPositionsUrlPattern(List tokens) { String text = LayoutTokensUtil.toText(tokens); List textResult = new ArrayList(); Matcher urlMatcher = TextUtilities.urlPattern.matcher(text); - while (urlMatcher.find()) { + while (urlMatcher.find()) { textResult.add(new OffsetPosition(urlMatcher.start(), urlMatcher.end())); } return textResult; } /** - * Identify in tokenized input the positions of an URL pattern with character positions, + * Identify in tokenized input the positions of a URL pattern with character positions, * and refine positions based on possible PDF URI annotations. 
- * + * * This will produce better quality recognized URL, avoiding missing suffixes and problems * with break lines and spaces. **/ +// public static List characterPositionsUrlPatternWithPdfAnnotations( +// List layoutTokens, +// List pdfAnnotations, +// String text) { +// +// List urlTokensPositions = tokensPositionUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations); +// +// // here we need to match the offsetPositions related to the text obtained by the layoutTokens, with the text +// // which may be different (spaces, hypen, breakline) +// StringBuilder accumulator = new StringBuilder(); +// List tokenizedText = GrobidAnalyzer.getInstance().tokenize(text); +// +// for (OffsetPosition urlOffsetPosition : urlTokensPositions) { +// int startTokenPosition = urlOffsetPosition.start; +// int endTokenPosition = urlOffsetPosition.end; +// +// List urlTokens = layoutTokens.subList(startTokenPosition, endTokenPosition); +// +// int tokenIndex = 0; +// int startPosition = 0; +// int endPosition = 0; +// for (LayoutToken token : urlTokens) { +// String tokenText = token.getText(); +// int textIndex = 0; +// for (int i = tokenIndex; i tokensPositionUrlPatternWithPdfAnnotations( + List layoutTokens, + List pdfAnnotations) { + + return convertStringOffsetToTokenOffset(characterPositionsUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations), layoutTokens); + } + + /** + * This method returns the character offsets in relation to the string obtained by the layout tokens. + * Notice the absence of the String text parameter. + */ public static List characterPositionsUrlPatternWithPdfAnnotations( - List layoutTokens, - List pdfAnnotations, - String text) { - List urlPositions = Lexicon.getInstance().characterPositionsUrlPattern(layoutTokens); + List layoutTokens, + List pdfAnnotations) { + List urlPositions = Lexicon.characterPositionsUrlPattern(layoutTokens); List resultPositions = new ArrayList<>(); // do we need to extend the url position based on additional position of the corresponding // PDF annotation? 
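        // Sketch of the refinement implemented by the loop below: locate the last layout
        // token of each regex-matched URL, search for a PDF URI annotation whose bounding
        // box contains that token, then extend the match with the following tokens while
        // they still belong to the annotation destination. For example (taken from the
        // integration tests), the regex match "https://github.com/lfoppiano/" followed by
        // the next-line token "supercon2" is merged into a single URL because the
        // annotation destination is "https://github.com/lfoppiano/supercon2".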
for(OffsetPosition urlPosition : urlPositions) { - int startPos = urlPosition.start; int endPos = urlPosition.end; @@ -1230,11 +1281,10 @@ public static List characterPositionsUrlPatternWithPdfAnnotation tokenIndex++; } - //String urlString = LayoutTokensUtil.toText(urlTokens); - String urlString = text.substring(startPos, endPos); + String urlString = LayoutTokensUtil.toText(urlTokens); PDFAnnotation targetAnnotation = null; - if (urlTokens.size()>0) { + if (CollectionUtils.isNotEmpty(urlTokens)) { LayoutToken lastToken = urlTokens.get(urlTokens.size()-1); if (pdfAnnotations != null) { for (PDFAnnotation pdfAnnotation : pdfAnnotations) { @@ -1253,7 +1303,7 @@ public static List characterPositionsUrlPatternWithPdfAnnotation String destination = targetAnnotation.getDestination(); int destinationPos = 0; - if (destination.indexOf(urlString) != -1) { + if (destination.contains(urlString)) { destinationPos = destination.indexOf(urlString)+urlString.length(); } @@ -1261,7 +1311,7 @@ public static List characterPositionsUrlPatternWithPdfAnnotation for(int j=endTokensIndex+1; j characterPositionsUrlPatternWithPdfAnnotation endPos += nextToken.getText().length(); destinationPos = pos + nextToken.getText().length(); urlTokens.add(nextToken); - } else + } else break; } } } // finally avoid ending a URL by a dot, because it can harm the sentence segmentation - if (text.charAt(endPos-1) == '.') - endPos = endPos-1; + if (StringUtils.substring(LayoutTokensUtil.toText(layoutTokens), startPos, endPos).endsWith(".")) { + endPos = endPos - 1; + } OffsetPosition position = new OffsetPosition(); position.start = startPos; @@ -1303,11 +1354,11 @@ public List tokenPositionsEmailPattern(List tokens) return new ArrayList(); List textResult = new ArrayList(); Matcher emailMatcher = TextUtilities.emailPattern.matcher(text); - while (emailMatcher.find()) { + while (emailMatcher.find()) { //System.out.println(urlMatcher.start() + " / " + urlMatcher.end() + " / " + text.substring(urlMatcher.start(), urlMatcher.end())); textResult.add(new OffsetPosition(emailMatcher.start(), emailMatcher.end())); } - return Utilities.convertStringOffsetToTokenOffset(textResult, tokens); + return convertStringOffsetToTokenOffset(textResult, tokens); } } diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java index 620f01a73a..9f42a7ce7a 100755 --- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java +++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java @@ -1,13 +1,16 @@ package org.grobid.core.lexicon; +import org.apache.commons.lang3.StringUtils; import org.grobid.core.analyzers.GrobidAnalyzer; +import org.grobid.core.layout.BoundingBox; +import org.grobid.core.layout.PDFAnnotation; import org.grobid.core.utilities.OffsetPosition; import org.grobid.core.utilities.LayoutTokensUtil; import org.grobid.core.layout.LayoutToken; -import org.junit.AfterClass; import org.junit.Before; import org.junit.Test; +import java.util.ArrayList; import java.util.List; import static org.hamcrest.CoreMatchers.is; @@ -407,4 +410,158 @@ public void testinFunders1Match() throws Exception { assertThat(positions.get(0).start, is(4)); assertThat(positions.get(0).end, is(6)); } + +// @Test +// public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval() throws Exception { +// final String input = "This work was distributed on http:// github.com/ myUsername/ MyProject"; +// 
List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); +// +// List offsetPositions = target.characterPositionsUrlPattern(tokenisedInput); +// +// assertThat(offsetPositions, hasSize(1)); +// OffsetPosition FirstURL = offsetPositions.get(0); +// assertThat(input.substring(FirstURL.start, FirstURL.end), is("http:// github.com/ myUsername/ MyProject")); +// } +// +// @Test +// public void testCharacterPositionsUrlPattern_two_URL_shouldReturnCorrectInterval() throws Exception { +// final String input = "This work was distributed on http:// github.com/ myUsername/ MyProject. The data is available at https :// github.com/ superconductors/ hola."; +// List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); +// +// List offsetPositions = target.characterPositionsUrlPattern(tokenisedInput); +// +// assertThat(offsetPositions, hasSize(2)); +// OffsetPosition url = offsetPositions.get(1); +// assertThat(input.substring(url.start, url.end), is("https :// github.com/ superconductors/ hola")); +// } +// +// @Test +// public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval_2() throws Exception { +// final String input = "720 137409 The Government of Lao PDR 2005 Forestry Strategy to the year 2020 of the Lao PDR (available at: https://faolex.fao.org/ docs/pdf/lao144178.pdf)"; +// List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); +// +// List offsetPositions = target.characterPositionsUrlPattern(tokenisedInput); +// +// assertThat(offsetPositions, hasSize(1)); +// OffsetPosition url = offsetPositions.get(0); +// assertThat(input.substring(url.start, url.end), is("https://faolex.fao.org/ docs/pdf/lao144178.pdf")); +// } + + @Test + public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval() throws Exception { + final String input = "1. 'internal status' indicates that their records should be \n" + + "hidden in the interface. \n" + + "2. In our previous work [1] we reported 77.03% F1-\n" + + "score. There is a slight decrease in absolute scores \n" + + "between DeLFT 0.2.8 and DeLFT 0.3.0. One cause \n" + + "may be the use of different hyperparameters in \n" + + "version 0.3.0 such as batch size and learning rate. \n" + + "However, the most probable cause could be the \n" + + "impact of using the Huggingface tokenizers \n" + + "library which is suffering from quality issues \n" + + "https://github.com/kermitt2/delft/issues/150. \n" + + "\n" + + "\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + //This is the actual text that is passed and is different from the layoutToken text. + final String inputReal = "1. 'internal status' indicates that their records should be hidden in the interface. 2. In our previous work [1] we reported 77.03% F1score. There is a slight decrease in absolute scores between DeLFT 0.2.8 and DeLFT 0.3.0. One cause may be the use of different hyperparameters in version 0.3.0 such as batch size and learning rate. However, the most probable cause could be the impact of using the Huggingface tokenizers library which is suffering from quality issues https://github.com/kermitt2/delft/issues/150. 
"; + + PDFAnnotation annotation = new PDFAnnotation(); + annotation.setPageNumber(10); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(10, 84.30, 706.68,177.39,9.52)); + annotation.setBoundingBoxes(boundingBoxes); + annotation.setDestination("https://github.com/kermitt2/delft/issues/150"); + annotation.setType(PDFAnnotation.Type.URI); + + List pdfAnnotations = List.of(annotation); + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url = offsetPositions.get(0); + assertThat(StringUtils.substring(input, url.start, url.end), is("https://github.com/kermitt2/delft/issues/150")); + } + + @Test + public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval2() throws Exception { + final String input = "This work is available at https://github.com/lfoppiano/ \n" + + "supercon2. The repository contains the code of the \n" + + "SuperCon 2 interface, the curation workflow, and the \n" + + "\n" + + "Table 2. Data support, the number of entities for each label in \n" + + "each of the datasets used for evaluating the ML models. The \n" + + "base dataset is the original dataset described in [18], and the \n" + + "curation dataset is automatically collected based on the data-\n" + + "base corrections by the interface and manually corrected. \n" + + "\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + LayoutToken lastTokenOfTheURL = tokenisedInput.get(19); + lastTokenOfTheURL.setPage(9); + lastTokenOfTheURL.setX(530.9363448275863); + lastTokenOfTheURL.setY(538.153); + lastTokenOfTheURL.setWidth(4.363655172413793); + lastTokenOfTheURL.setHeight(9.702); + + //This is the actual text that is passed and is different from the layoutToken text. + final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. "; + + PDFAnnotation annotation = new PDFAnnotation(); + annotation.setPageNumber(9); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(9,408.76,537.11,126.54,10.49)); + annotation.setBoundingBoxes(boundingBoxes); + annotation.setDestination("https://github.com/lfoppiano/supercon2"); + annotation.setType(PDFAnnotation.Type.URI); + + List pdfAnnotations = List.of(annotation); + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url = offsetPositions.get(0); + assertThat(input.substring(url.start, url.end), is("https://github.com/lfoppiano/ \nsupercon2")); + } + + @Test + public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectIntervalBasedOnText() throws Exception { + final String input = "This work is available at https://github.com/lfoppiano/ \n" + + "supercon2. The repository contains the code of the \n" + + "SuperCon 2 interface, the curation workflow, and the \n" + + "\n" + + "Table 2. 
Data support, the number of entities for each label in \n" + + "each of the datasets used for evaluating the ML models. The \n" + + "base dataset is the original dataset described in [18], and the \n" + + "curation dataset is automatically collected based on the data-\n" + + "base corrections by the interface and manually corrected. \n" + + "\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + LayoutToken lastTokenOfTheURL = tokenisedInput.get(19); + lastTokenOfTheURL.setPage(9); + lastTokenOfTheURL.setX(530.9363448275863); + lastTokenOfTheURL.setY(538.153); + lastTokenOfTheURL.setWidth(4.363655172413793); + lastTokenOfTheURL.setHeight(9.702); + + + PDFAnnotation annotation = new PDFAnnotation(); + annotation.setPageNumber(9); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(9,408.76,537.11,126.54,10.49)); + annotation.setBoundingBoxes(boundingBoxes); + annotation.setDestination("https://github.com/lfoppiano/supercon2"); + annotation.setType(PDFAnnotation.Type.URI); + List pdfAnnotations = List.of(annotation); + + //This is the actual text that is passed and is different from the layoutToken text. + final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. "; + + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations, inputReal); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url = offsetPositions.get(0); + assertThat(inputReal.substring(url.start, url.end), is("https://github.com/lfoppiano/ supercon2")); + } } \ No newline at end of file From dcda0dc51bab163020d41690733c52be365989c9 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 11 Apr 2024 18:14:22 +0900 Subject: [PATCH 03/14] typos --- .../org/grobid/core/engines/FundingAcknowledgementParser.java | 4 ++-- .../src/main/java/org/grobid/core/utilities/Utilities.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java index c92b270ff1..1068be3e28 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java @@ -102,7 +102,7 @@ public MutablePair,List,List,List,List convertStringOffsetToTokenOffsetOld( } /** - * This version uses actual LayoutToken offsets relative to the tokens present in argment only. + * This version uses actual LayoutToken offsets relative to the tokens present in argument only. * It supposes that the stringPosition have been identified on the provided tokens only, and not * restricted to the complete document. 
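 *
 * For example (mirroring the unit test added in the next patch): for the tokens of
 * "This is a token." and a string-level position (5, 9), the returned token-level
 * position has an inclusive end index, so that
 * tokens.subList(position.start, position.end + 1) yields "is a".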
*/ From a3cc84e464f181bc4d63879f949c66d8a65c2590 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 11 Apr 2024 20:47:28 +0900 Subject: [PATCH 04/14] Add test --- .../core/lexicon/LexiconIntegrationTest.java | 40 +++++++++++++++++++ .../grobid/core/utilities/UtilitiesTest.java | 19 +++++++++ 2 files changed, 59 insertions(+) diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java index 9f42a7ce7a..e312871c08 100755 --- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java +++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java @@ -524,6 +524,46 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC assertThat(input.substring(url.start, url.end), is("https://github.com/lfoppiano/ \nsupercon2")); } + @Test + public void testTokensPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval2() throws Exception { + final String input = "This work is available at https://github.com/lfoppiano/ \n" + + "supercon2. The repository contains the code of the \n" + + "SuperCon 2 interface, the curation workflow, and the \n" + + "\n" + + "Table 2. Data support, the number of entities for each label in \n" + + "each of the datasets used for evaluating the ML models. The \n" + + "base dataset is the original dataset described in [18], and the \n" + + "curation dataset is automatically collected based on the data-\n" + + "base corrections by the interface and manually corrected. \n" + + "\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + LayoutToken lastTokenOfTheURL = tokenisedInput.get(19); + lastTokenOfTheURL.setPage(9); + lastTokenOfTheURL.setX(530.9363448275863); + lastTokenOfTheURL.setY(538.153); + lastTokenOfTheURL.setWidth(4.363655172413793); + lastTokenOfTheURL.setHeight(9.702); + + //This is the actual text that is passed and is different from the layoutToken text. + final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. 
"; + + PDFAnnotation annotation = new PDFAnnotation(); + annotation.setPageNumber(9); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(9,408.76,537.11,126.54,10.49)); + annotation.setBoundingBoxes(boundingBoxes); + annotation.setDestination("https://github.com/lfoppiano/supercon2"); + annotation.setType(PDFAnnotation.Type.URI); + + List pdfAnnotations = List.of(annotation); + List offsetPositions = Lexicon.tokensPositionUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url = offsetPositions.get(0); + assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(url.start, url.end)), is("https://github.com/lfoppiano/ \nsupercon2")); + } + @Test public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectIntervalBasedOnText() throws Exception { final String input = "This work is available at https://github.com/lfoppiano/ \n" + diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/UtilitiesTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/UtilitiesTest.java index 64d6d4be7a..9e5a6958ff 100644 --- a/grobid-core/src/test/java/org/grobid/core/utilities/UtilitiesTest.java +++ b/grobid-core/src/test/java/org/grobid/core/utilities/UtilitiesTest.java @@ -3,10 +3,17 @@ import java.io.File; import java.io.IOException; +import java.util.Arrays; import java.util.List; import java.util.ArrayList; +import org.grobid.core.analyzers.GrobidAnalyzer; +import org.grobid.core.layout.LayoutToken; import org.junit.Test; + +import static org.grobid.core.utilities.Utilities.convertStringOffsetToTokenOffset; +import static org.hamcrest.Matchers.hasSize; +import static org.hamcrest.Matchers.is; import static org.junit.Assert.*; public class UtilitiesTest { @@ -112,4 +119,16 @@ public void testMergePositionsOverlap() throws IOException { assertEquals(positions.get(1).start, 7); assertEquals(positions.get(1).end, 10); } + + @Test + public void testConvertStringOffsetToTokenOffset() throws Exception { + String input = "This is a token."; + List layoutTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + OffsetPosition stringPosition = new OffsetPosition(5, 9); + List tokenOffsets = convertStringOffsetToTokenOffset(Arrays.asList(stringPosition), layoutTokens); + + assertThat(tokenOffsets, hasSize(1)); + OffsetPosition position = tokenOffsets.get(0); + assertThat(LayoutTokensUtil.toText(layoutTokens.subList(position.start, position.end + 1)), is("is a")); + } } From ddd9336d3e42e97e9aa5622886ab4c60fedd174c Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 11 Apr 2024 20:48:14 +0900 Subject: [PATCH 05/14] improvements --- .../main/java/org/grobid/core/engines/FullTextParser.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 28eda7e693..953d92f8b1 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -11,6 +11,7 @@ import java.nio.charset.StandardCharsets; +import org.apache.lucene.util.CollectionUtil; import org.grobid.core.GrobidModels; import org.grobid.core.data.*; import org.grobid.core.document.Document; @@ -478,7 +479,7 @@ public Pair> processShort(List tokens, Do List currentChunk = new ArrayList<>(); int currentPos = 0; for(LayoutToken token : 
tokens) { - if (currentChunk.size() != 0) { + if (CollectionUtils.isNotEmpty(currentChunk)) { int tokenPos = token.getOffset(); if (currentPos != tokenPos) { // new chunk @@ -508,7 +509,7 @@ public Pair> processShort(List tokens, Do LayoutTokenization layouts = featSeg.getRight(); if (layouts != null) layoutTokenization = layouts.getTokenization(); - if ( (featuredText != null) && (featuredText.trim().length() > 0) ) { + if (StringUtils.isNotBlank(featuredText)) { res = label(featuredText); res = postProcessFullTextLabeledText(res); } From ca3c3524ae4422b56f23d9e2cefef8174a926e8b Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 12 Apr 2024 06:27:22 +0900 Subject: [PATCH 06/14] add method to match the offset from the layout token raw string to a "postprocessed" text --- .../java/org/grobid/core/data/Figure.java | 2 +- .../grobid/core/document/TEIFormatter.java | 22 ++++--- .../java/org/grobid/core/lexicon/Lexicon.java | 62 +++++-------------- .../grobid/core/utilities/TextUtilities.java | 43 +++++++++++++ .../core/utilities/TextUtilitiesTest.java | 29 +++++++++ 5 files changed, 103 insertions(+), 55 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/data/Figure.java b/grobid-core/src/main/java/org/grobid/core/data/Figure.java index e9417e9217..ef4117e93c 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Figure.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Figure.java @@ -432,7 +432,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form } if (desc != null && config.isWithSentenceSegmentation()) { - formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage()); + formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations()); // we need a sentence segmentation of the figure caption, for that we need to introduce // a
<div>, then a <p>
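            // Note: passing doc.getPDFAnnotations() through to segmentIntoSentences (the
            // TEIFormatter change in this same patch) means URLs confirmed by PDF link
            // annotations are protected from sentence splitting in figure captions too.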
diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 33affa2d03..20a7746388 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1321,7 +1321,7 @@ private StringBuilder toTEINote(StringBuilder tei, if (config.isWithSentenceSegmentation()) { - segmentIntoSentences(pNote, noteTokens, config, doc.getLanguage()); + segmentIntoSentences(pNote, noteTokens, config, doc.getLanguage(), doc.getPDFAnnotations()); } desc.appendChild(pNote); @@ -1523,7 +1523,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(clusterTokens); if (isNewParagraph(lastClusterLabel, curParagraph)) { if (curParagraph != null && config.isWithSentenceSegmentation()) { - segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage()); + segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage(), doc.getPDFAnnotations()); } curParagraph = teiElement("p"); if (config.isGenerateTeiIds()) { @@ -1551,7 +1551,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, } else { if (isNewParagraph(lastClusterLabel, curParagraph)) { if (curParagraph != null && config.isWithSentenceSegmentation()) { - segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage()); + segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage(), doc.getPDFAnnotations()); } curParagraph = teiElement("p"); if (config.isGenerateTeiIds()) { @@ -1767,7 +1767,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, // in case we segment paragraph into sentences, we still need to do it for the last paragraph if (curParagraph != null && config.isWithSentenceSegmentation()) { - segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage()); + segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage(), doc.getPDFAnnotations()); } // remove possibly empty div in the div list @@ -1834,6 +1834,10 @@ public static boolean isNewParagraph(TaggingLabel lastClusterLabel, Element curP } public void segmentIntoSentences(Element curParagraph, List curParagraphTokens, GrobidAnalysisConfig config, String lang) { + segmentIntoSentences(curParagraph, curParagraphTokens, config, lang, new ArrayList<>()); + } + + public void segmentIntoSentences(Element curParagraph, List curParagraphTokens, GrobidAnalysisConfig config, String lang, List annotations) { // in order to avoid having a sentence boundary in the middle of a ref element // (which is frequent given the abbreviation in the reference expression, e.g. Fig.) // we only consider for sentence segmentation texts under
<p>
and skip the text under . @@ -1842,7 +1846,7 @@ public void segmentIntoSentences(Element curParagraph, List curPara // in xom, the following gives all the text under the element, for the whole subtree String text = curParagraph.getValue(); - if (text == null || text.length() == 0) + if (StringUtils.isEmpty(text)) return; // identify ref nodes, ref spans and ref positions @@ -1859,8 +1863,8 @@ public void segmentIntoSentences(Element curParagraph, List curPara // for readability in another conditional if (((Element) theNode).getLocalName().equals("ref")) { // map character offset of the node - mapRefNodes.put(Integer.valueOf(pos), theNode); - refPositions.add(Integer.valueOf(pos)); + mapRefNodes.put(pos, theNode); + refPositions.add(pos); String chunk = theNode.getValue(); forbiddenPositions.add(new OffsetPosition(pos, pos+chunk.length())); @@ -1869,7 +1873,7 @@ public void segmentIntoSentences(Element curParagraph, List curPara } } - List offsetPositionsUrls = Lexicon.getInstance().characterPositionsUrlPattern(curParagraphTokens); + List offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations); forbiddenPositions.addAll(offsetPositionsUrls); List theSentences = @@ -1894,7 +1898,7 @@ public void segmentIntoSentences(Element curParagraph, List curPara for(int i=0; i characterPositionsUrlPattern(List characterPositionsUrlPatternWithPdfAnnotations( -// List layoutTokens, -// List pdfAnnotations, -// String text) { -// -// List urlTokensPositions = tokensPositionUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations); -// -// // here we need to match the offsetPositions related to the text obtained by the layoutTokens, with the text -// // which may be different (spaces, hypen, breakline) -// StringBuilder accumulator = new StringBuilder(); -// List tokenizedText = GrobidAnalyzer.getInstance().tokenize(text); -// -// for (OffsetPosition urlOffsetPosition : urlTokensPositions) { -// int startTokenPosition = urlOffsetPosition.start; -// int endTokenPosition = urlOffsetPosition.end; -// -// List urlTokens = layoutTokens.subList(startTokenPosition, endTokenPosition); -// -// int tokenIndex = 0; -// int startPosition = 0; -// int endPosition = 0; -// for (LayoutToken token : urlTokens) { -// String tokenText = token.getText(); -// int textIndex = 0; -// for (int i = tokenIndex; i characterPositionsUrlPatternWithPdfAnnotations( + List layoutTokens, + List pdfAnnotations, + String text) { + + List urlTokensPositions = tokensPositionUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations); + + // here we need to match the offsetPositions related to the text obtained by the layoutTokens, with the text + // which may be different (spaces, hypen, breakline) + return TextUtilities.matchTokenAndString(layoutTokens, text, urlTokensPositions); + } /** * This method returns the token positions in respect of the layout tokens @@ -1240,7 +1207,12 @@ public static List tokensPositionUrlPatternWithPdfAnnotations( List layoutTokens, List pdfAnnotations) { - return convertStringOffsetToTokenOffset(characterPositionsUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations), layoutTokens); + List offsetPositions = convertStringOffsetToTokenOffset(characterPositionsUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations), layoutTokens); + // We need to adjust the end of the positions to avoid problems with the sublist + + offsetPositions.stream().forEach(o -> o.end += 1); + + return offsetPositions; } /** diff --git 
a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java index 06f69bcdee..87224cace8 100755 --- a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java @@ -1556,4 +1556,47 @@ public static org.apache.commons.lang3.tuple.Pair matchTokenAndString(List layoutTokens, String text, List urlPositions) { + List newPositions = new ArrayList<>(); + StringBuilder accumulator = new StringBuilder(); + int pos = 0; + + for (OffsetPosition urlPosition : urlPositions) { + List urlTokens = layoutTokens.subList(urlPosition.start, urlPosition.end); + boolean first = true; + for (int i = 0; i < urlTokens.size(); i++) { + LayoutToken token = urlTokens.get(i); + if (StringUtils.isEmpty(token.getText())) + continue; + int newPos = text.indexOf(token.getText(), pos); + if (newPos != -1) { + if (first) { + pos = newPos; + first = false; + } + accumulator.append(token); + } else { + if (SentenceUtilities.toSkipToken(token.getText())) { + continue; + } + if (StringUtils.isNotEmpty(accumulator)) { + int start = text.indexOf(accumulator.toString(), pos); + newPositions.add(new OffsetPosition(start, start + accumulator.toString().length())); + accumulator = new StringBuilder(); + pos = newPos; + first = true; + break; + } + pos = newPos; + } + } + } + if (StringUtils.isNotEmpty(accumulator)) { + int start = text.indexOf(accumulator.toString(), pos); + newPositions.add(new OffsetPosition(start, start + accumulator.toString().length())); + } + + return newPositions; + } } diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java index ff5ac7467b..6303dc6450 100644 --- a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java +++ b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java @@ -8,12 +8,14 @@ import org.junit.Test; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.regex.Matcher; import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.CoreMatchers.startsWith; import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.hasSize; import static org.junit.Assert.*; @@ -407,4 +409,31 @@ public void testOrcidPattern() { } } } + + @Test + public void testMatchTokenAndString() throws Exception { + final String input = "This work is available at https://github.com/lfoppiano/ \n" + + "supercon2. The repository contains the code of the \n" + + "SuperCon 2 interface, the curation workflow, and the \n" + + "\n" + + "Table 2. Data support, the number of entities for each label in \n" + + "each of the datasets used for evaluating the ML models. The \n" + + "base dataset is the original dataset described in [18], and the \n" + + "curation dataset is automatically collected based on the data-\n" + + "base corrections by the interface and manually corrected. \n" + + "\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. 
The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. "; + List urlTokens = Arrays.asList(new OffsetPosition(10, 23)); + + List offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, inputReal, urlTokens); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url1 = offsetPositions.get(0); + assertThat(url1.start, is(26)); + assertThat(url1.end, is(65)); + assertThat(inputReal.substring(url1.start, url1.end), is("https://github.com/lfoppiano/ supercon2")); + + } } From fbbf254b7ae8db3d33b6421488c742f008082a0b Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 12 Apr 2024 06:36:09 +0900 Subject: [PATCH 07/14] Use a lexicon normal test for static methods --- .../core/lexicon/LexiconIntegrationTest.java | 194 ---------------- .../org/grobid/core/lexicon/LexiconTest.java | 213 ++++++++++++++++++ 2 files changed, 213 insertions(+), 194 deletions(-) create mode 100644 grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java index e312871c08..2d888520ec 100755 --- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java +++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java @@ -410,198 +410,4 @@ public void testinFunders1Match() throws Exception { assertThat(positions.get(0).start, is(4)); assertThat(positions.get(0).end, is(6)); } - -// @Test -// public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval() throws Exception { -// final String input = "This work was distributed on http:// github.com/ myUsername/ MyProject"; -// List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); -// -// List offsetPositions = target.characterPositionsUrlPattern(tokenisedInput); -// -// assertThat(offsetPositions, hasSize(1)); -// OffsetPosition FirstURL = offsetPositions.get(0); -// assertThat(input.substring(FirstURL.start, FirstURL.end), is("http:// github.com/ myUsername/ MyProject")); -// } -// -// @Test -// public void testCharacterPositionsUrlPattern_two_URL_shouldReturnCorrectInterval() throws Exception { -// final String input = "This work was distributed on http:// github.com/ myUsername/ MyProject. 
The data is available at https :// github.com/ superconductors/ hola."; -// List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); -// -// List offsetPositions = target.characterPositionsUrlPattern(tokenisedInput); -// -// assertThat(offsetPositions, hasSize(2)); -// OffsetPosition url = offsetPositions.get(1); -// assertThat(input.substring(url.start, url.end), is("https :// github.com/ superconductors/ hola")); -// } -// -// @Test -// public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval_2() throws Exception { -// final String input = "720 137409 The Government of Lao PDR 2005 Forestry Strategy to the year 2020 of the Lao PDR (available at: https://faolex.fao.org/ docs/pdf/lao144178.pdf)"; -// List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); -// -// List offsetPositions = target.characterPositionsUrlPattern(tokenisedInput); -// -// assertThat(offsetPositions, hasSize(1)); -// OffsetPosition url = offsetPositions.get(0); -// assertThat(input.substring(url.start, url.end), is("https://faolex.fao.org/ docs/pdf/lao144178.pdf")); -// } - - @Test - public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval() throws Exception { - final String input = "1. 'internal status' indicates that their records should be \n" + - "hidden in the interface. \n" + - "2. In our previous work [1] we reported 77.03% F1-\n" + - "score. There is a slight decrease in absolute scores \n" + - "between DeLFT 0.2.8 and DeLFT 0.3.0. One cause \n" + - "may be the use of different hyperparameters in \n" + - "version 0.3.0 such as batch size and learning rate. \n" + - "However, the most probable cause could be the \n" + - "impact of using the Huggingface tokenizers \n" + - "library which is suffering from quality issues \n" + - "https://github.com/kermitt2/delft/issues/150. \n" + - "\n" + - "\n"; - - List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); - - //This is the actual text that is passed and is different from the layoutToken text. - final String inputReal = "1. 'internal status' indicates that their records should be hidden in the interface. 2. In our previous work [1] we reported 77.03% F1score. There is a slight decrease in absolute scores between DeLFT 0.2.8 and DeLFT 0.3.0. One cause may be the use of different hyperparameters in version 0.3.0 such as batch size and learning rate. However, the most probable cause could be the impact of using the Huggingface tokenizers library which is suffering from quality issues https://github.com/kermitt2/delft/issues/150. 
"; - - PDFAnnotation annotation = new PDFAnnotation(); - annotation.setPageNumber(10); - List boundingBoxes = new ArrayList<>(); - boundingBoxes.add(BoundingBox.fromPointAndDimensions(10, 84.30, 706.68,177.39,9.52)); - annotation.setBoundingBoxes(boundingBoxes); - annotation.setDestination("https://github.com/kermitt2/delft/issues/150"); - annotation.setType(PDFAnnotation.Type.URI); - - List pdfAnnotations = List.of(annotation); - List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); - - assertThat(offsetPositions, hasSize(1)); - OffsetPosition url = offsetPositions.get(0); - assertThat(StringUtils.substring(input, url.start, url.end), is("https://github.com/kermitt2/delft/issues/150")); - } - - @Test - public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval2() throws Exception { - final String input = "This work is available at https://github.com/lfoppiano/ \n" + - "supercon2. The repository contains the code of the \n" + - "SuperCon 2 interface, the curation workflow, and the \n" + - "\n" + - "Table 2. Data support, the number of entities for each label in \n" + - "each of the datasets used for evaluating the ML models. The \n" + - "base dataset is the original dataset described in [18], and the \n" + - "curation dataset is automatically collected based on the data-\n" + - "base corrections by the interface and manually corrected. \n" + - "\n"; - - List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); - LayoutToken lastTokenOfTheURL = tokenisedInput.get(19); - lastTokenOfTheURL.setPage(9); - lastTokenOfTheURL.setX(530.9363448275863); - lastTokenOfTheURL.setY(538.153); - lastTokenOfTheURL.setWidth(4.363655172413793); - lastTokenOfTheURL.setHeight(9.702); - - //This is the actual text that is passed and is different from the layoutToken text. - final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. "; - - PDFAnnotation annotation = new PDFAnnotation(); - annotation.setPageNumber(9); - List boundingBoxes = new ArrayList<>(); - boundingBoxes.add(BoundingBox.fromPointAndDimensions(9,408.76,537.11,126.54,10.49)); - annotation.setBoundingBoxes(boundingBoxes); - annotation.setDestination("https://github.com/lfoppiano/supercon2"); - annotation.setType(PDFAnnotation.Type.URI); - - List pdfAnnotations = List.of(annotation); - List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); - - assertThat(offsetPositions, hasSize(1)); - OffsetPosition url = offsetPositions.get(0); - assertThat(input.substring(url.start, url.end), is("https://github.com/lfoppiano/ \nsupercon2")); - } - - @Test - public void testTokensPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval2() throws Exception { - final String input = "This work is available at https://github.com/lfoppiano/ \n" + - "supercon2. The repository contains the code of the \n" + - "SuperCon 2 interface, the curation workflow, and the \n" + - "\n" + - "Table 2. 
Data support, the number of entities for each label in \n" + - "each of the datasets used for evaluating the ML models. The \n" + - "base dataset is the original dataset described in [18], and the \n" + - "curation dataset is automatically collected based on the data-\n" + - "base corrections by the interface and manually corrected. \n" + - "\n"; - - List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); - LayoutToken lastTokenOfTheURL = tokenisedInput.get(19); - lastTokenOfTheURL.setPage(9); - lastTokenOfTheURL.setX(530.9363448275863); - lastTokenOfTheURL.setY(538.153); - lastTokenOfTheURL.setWidth(4.363655172413793); - lastTokenOfTheURL.setHeight(9.702); - - //This is the actual text that is passed and is different from the layoutToken text. - final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. "; - - PDFAnnotation annotation = new PDFAnnotation(); - annotation.setPageNumber(9); - List boundingBoxes = new ArrayList<>(); - boundingBoxes.add(BoundingBox.fromPointAndDimensions(9,408.76,537.11,126.54,10.49)); - annotation.setBoundingBoxes(boundingBoxes); - annotation.setDestination("https://github.com/lfoppiano/supercon2"); - annotation.setType(PDFAnnotation.Type.URI); - - List pdfAnnotations = List.of(annotation); - List offsetPositions = Lexicon.tokensPositionUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); - - assertThat(offsetPositions, hasSize(1)); - OffsetPosition url = offsetPositions.get(0); - assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(url.start, url.end)), is("https://github.com/lfoppiano/ \nsupercon2")); - } - - @Test - public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectIntervalBasedOnText() throws Exception { - final String input = "This work is available at https://github.com/lfoppiano/ \n" + - "supercon2. The repository contains the code of the \n" + - "SuperCon 2 interface, the curation workflow, and the \n" + - "\n" + - "Table 2. Data support, the number of entities for each label in \n" + - "each of the datasets used for evaluating the ML models. The \n" + - "base dataset is the original dataset described in [18], and the \n" + - "curation dataset is automatically collected based on the data-\n" + - "base corrections by the interface and manually corrected. 
\n" + - "\n"; - - List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); - LayoutToken lastTokenOfTheURL = tokenisedInput.get(19); - lastTokenOfTheURL.setPage(9); - lastTokenOfTheURL.setX(530.9363448275863); - lastTokenOfTheURL.setY(538.153); - lastTokenOfTheURL.setWidth(4.363655172413793); - lastTokenOfTheURL.setHeight(9.702); - - - PDFAnnotation annotation = new PDFAnnotation(); - annotation.setPageNumber(9); - List boundingBoxes = new ArrayList<>(); - boundingBoxes.add(BoundingBox.fromPointAndDimensions(9,408.76,537.11,126.54,10.49)); - annotation.setBoundingBoxes(boundingBoxes); - annotation.setDestination("https://github.com/lfoppiano/supercon2"); - annotation.setType(PDFAnnotation.Type.URI); - List pdfAnnotations = List.of(annotation); - - //This is the actual text that is passed and is different from the layoutToken text. - final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. "; - - List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations, inputReal); - - assertThat(offsetPositions, hasSize(1)); - OffsetPosition url = offsetPositions.get(0); - assertThat(inputReal.substring(url.start, url.end), is("https://github.com/lfoppiano/ supercon2")); - } } \ No newline at end of file diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java new file mode 100644 index 0000000000..f9d1e4d7c4 --- /dev/null +++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java @@ -0,0 +1,213 @@ +package org.grobid.core.lexicon; + +import org.apache.commons.lang3.StringUtils; +import org.grobid.core.analyzers.GrobidAnalyzer; +import org.grobid.core.layout.BoundingBox; +import org.grobid.core.layout.LayoutToken; +import org.grobid.core.layout.PDFAnnotation; +import org.grobid.core.utilities.LayoutTokensUtil; +import org.grobid.core.utilities.OffsetPosition; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.List; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.hasSize; + +public class LexiconTest { +// @Test +// public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval() throws Exception { +// final String input = "This work was distributed on http:// github.com/ myUsername/ MyProject"; +// List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); +// +// List offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput); +// +// assertThat(offsetPositions, hasSize(1)); +// OffsetPosition FirstURL = offsetPositions.get(0); +// assertThat(input.substring(FirstURL.start, FirstURL.end), is("http:// github.com/ myUsername/ MyProject")); +// } +// +// @Test +// public void testCharacterPositionsUrlPattern_two_URL_shouldReturnCorrectInterval() throws Exception { +// final String input = "This work was distributed on http:// github.com/ myUsername/ MyProject. 
The data is available at https :// github.com/ superconductors/ hola."; +// List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); +// +// List offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput); +// +// assertThat(offsetPositions, hasSize(2)); +// OffsetPosition url = offsetPositions.get(1); +// assertThat(input.substring(url.start, url.end), is("https :// github.com/ superconductors/ hola")); +// } +// +// @Test +// public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval_2() throws Exception { +// final String input = "720 137409 The Government of Lao PDR 2005 Forestry Strategy to the year 2020 of the Lao PDR (available at: https://faolex.fao.org/ docs/pdf/lao144178.pdf)"; +// List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); +// +// List offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput); +// +// assertThat(offsetPositions, hasSize(1)); +// OffsetPosition url = offsetPositions.get(0); +// assertThat(input.substring(url.start, url.end), is("https://faolex.fao.org/ docs/pdf/lao144178.pdf")); +// } + + @Test + public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval() throws Exception { + final String input = "1. 'internal status' indicates that their records should be \n" + + "hidden in the interface. \n" + + "2. In our previous work [1] we reported 77.03% F1-\n" + + "score. There is a slight decrease in absolute scores \n" + + "between DeLFT 0.2.8 and DeLFT 0.3.0. One cause \n" + + "may be the use of different hyperparameters in \n" + + "version 0.3.0 such as batch size and learning rate. \n" + + "However, the most probable cause could be the \n" + + "impact of using the Huggingface tokenizers \n" + + "library which is suffering from quality issues \n" + + "https://github.com/kermitt2/delft/issues/150. \n" + + "\n" + + "\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + //This is the actual text that is passed and is different from the layoutToken text. + final String inputReal = "1. 'internal status' indicates that their records should be hidden in the interface. 2. In our previous work [1] we reported 77.03% F1score. There is a slight decrease in absolute scores between DeLFT 0.2.8 and DeLFT 0.3.0. One cause may be the use of different hyperparameters in version 0.3.0 such as batch size and learning rate. However, the most probable cause could be the impact of using the Huggingface tokenizers library which is suffering from quality issues https://github.com/kermitt2/delft/issues/150. 
"; + + PDFAnnotation annotation = new PDFAnnotation(); + annotation.setPageNumber(10); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(10, 84.30, 706.68, 177.39, 9.52)); + annotation.setBoundingBoxes(boundingBoxes); + annotation.setDestination("https://github.com/kermitt2/delft/issues/150"); + annotation.setType(PDFAnnotation.Type.URI); + + List pdfAnnotations = List.of(annotation); + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url = offsetPositions.get(0); + assertThat(StringUtils.substring(input, url.start, url.end), is("https://github.com/kermitt2/delft/issues/150")); + } + + @Test + public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval2() throws Exception { + final String input = "This work is available at https://github.com/lfoppiano/ \n" + + "supercon2. The repository contains the code of the \n" + + "SuperCon 2 interface, the curation workflow, and the \n" + + "\n" + + "Table 2. Data support, the number of entities for each label in \n" + + "each of the datasets used for evaluating the ML models. The \n" + + "base dataset is the original dataset described in [18], and the \n" + + "curation dataset is automatically collected based on the data-\n" + + "base corrections by the interface and manually corrected. \n" + + "\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + LayoutToken lastTokenOfTheURL = tokenisedInput.get(19); + lastTokenOfTheURL.setPage(9); + lastTokenOfTheURL.setX(530.9363448275863); + lastTokenOfTheURL.setY(538.153); + lastTokenOfTheURL.setWidth(4.363655172413793); + lastTokenOfTheURL.setHeight(9.702); + + //This is the actual text that is passed and is different from the layoutToken text. + final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. "; + + PDFAnnotation annotation = new PDFAnnotation(); + annotation.setPageNumber(9); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(9, 408.76, 537.11, 126.54, 10.49)); + annotation.setBoundingBoxes(boundingBoxes); + annotation.setDestination("https://github.com/lfoppiano/supercon2"); + annotation.setType(PDFAnnotation.Type.URI); + + List pdfAnnotations = List.of(annotation); + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url = offsetPositions.get(0); + assertThat(input.substring(url.start, url.end), is("https://github.com/lfoppiano/ \nsupercon2")); + } + + @Test + public void testTokensPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval2() throws Exception { + final String input = "This work is available at https://github.com/lfoppiano/ \n" + + "supercon2. The repository contains the code of the \n" + + "SuperCon 2 interface, the curation workflow, and the \n" + + "\n" + + "Table 2. 
Data support, the number of entities for each label in \n" + + "each of the datasets used for evaluating the ML models. The \n" + + "base dataset is the original dataset described in [18], and the \n" + + "curation dataset is automatically collected based on the data-\n" + + "base corrections by the interface and manually corrected. \n" + + "\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + LayoutToken lastTokenOfTheURL = tokenisedInput.get(19); + lastTokenOfTheURL.setPage(9); + lastTokenOfTheURL.setX(530.9363448275863); + lastTokenOfTheURL.setY(538.153); + lastTokenOfTheURL.setWidth(4.363655172413793); + lastTokenOfTheURL.setHeight(9.702); + + //This is the actual text that is passed and is different from the layoutToken text. + final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. "; + + PDFAnnotation annotation = new PDFAnnotation(); + annotation.setPageNumber(9); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(9, 408.76, 537.11, 126.54, 10.49)); + annotation.setBoundingBoxes(boundingBoxes); + annotation.setDestination("https://github.com/lfoppiano/supercon2"); + annotation.setType(PDFAnnotation.Type.URI); + + List pdfAnnotations = List.of(annotation); + List offsetPositions = Lexicon.tokensPositionUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url = offsetPositions.get(0); + assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(url.start, url.end)), is("https://github.com/lfoppiano/ \nsupercon2")); + } + + @Test + public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectIntervalBasedOnText() throws Exception { + final String input = "This work is available at https://github.com/lfoppiano/ \n" + + "supercon2. The repository contains the code of the \n" + + "SuperCon 2 interface, the curation workflow, and the \n" + + "\n" + + "Table 2. Data support, the number of entities for each label in \n" + + "each of the datasets used for evaluating the ML models. The \n" + + "base dataset is the original dataset described in [18], and the \n" + + "curation dataset is automatically collected based on the data-\n" + + "base corrections by the interface and manually corrected. 
\n" + + "\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + LayoutToken lastTokenOfTheURL = tokenisedInput.get(19); + lastTokenOfTheURL.setPage(9); + lastTokenOfTheURL.setX(530.9363448275863); + lastTokenOfTheURL.setY(538.153); + lastTokenOfTheURL.setWidth(4.363655172413793); + lastTokenOfTheURL.setHeight(9.702); + + + PDFAnnotation annotation = new PDFAnnotation(); + annotation.setPageNumber(9); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(9, 408.76, 537.11, 126.54, 10.49)); + annotation.setBoundingBoxes(boundingBoxes); + annotation.setDestination("https://github.com/lfoppiano/supercon2"); + annotation.setType(PDFAnnotation.Type.URI); + List pdfAnnotations = List.of(annotation); + + //This is the actual text that is passed and is different from the layoutToken text. + final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. "; + + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations, inputReal); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url = offsetPositions.get(0); + assertThat(inputReal.substring(url.start, url.end), is("https://github.com/lfoppiano/ supercon2")); + } +} From 621d1da03a53402e59ec8601aa376ae0ce931db0 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 12 Apr 2024 10:10:21 +0900 Subject: [PATCH 08/14] fix consistency in method names --- .../src/main/java/org/grobid/core/lexicon/Lexicon.java | 4 ++-- .../src/test/java/org/grobid/core/lexicon/LexiconTest.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java index 247d161f5e..5bd5e642b9 100755 --- a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java +++ b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java @@ -1193,7 +1193,7 @@ public static List characterPositionsUrlPatternWithPdfAnnotation List pdfAnnotations, String text) { - List urlTokensPositions = tokensPositionUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations); + List urlTokensPositions = tokenPositionUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations); // here we need to match the offsetPositions related to the text obtained by the layoutTokens, with the text // which may be different (spaces, hypen, breakline) @@ -1203,7 +1203,7 @@ public static List characterPositionsUrlPatternWithPdfAnnotation /** * This method returns the token positions in respect of the layout tokens */ - public static List tokensPositionUrlPatternWithPdfAnnotations( + public static List tokenPositionUrlPatternWithPdfAnnotations( List layoutTokens, List pdfAnnotations) { diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java index f9d1e4d7c4..7aab70c58a 100644 --- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java +++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java @@ -163,7 +163,7 
@@ public void testTokensPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorr annotation.setType(PDFAnnotation.Type.URI); List pdfAnnotations = List.of(annotation); - List offsetPositions = Lexicon.tokensPositionUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); + List offsetPositions = Lexicon.tokenPositionUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); assertThat(offsetPositions, hasSize(1)); OffsetPosition url = offsetPositions.get(0); From 96073916bd5c486ecd28840a5dc8399182bbd156 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 12 Apr 2024 11:30:10 +0900 Subject: [PATCH 09/14] Update tests --- .../org/grobid/core/lexicon/LexiconTest.java | 81 +++++++++---------- 1 file changed, 39 insertions(+), 42 deletions(-) diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java index 7aab70c58a..abf407dbbf 100644 --- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java +++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java @@ -7,6 +7,7 @@ import org.grobid.core.layout.PDFAnnotation; import org.grobid.core.utilities.LayoutTokensUtil; import org.grobid.core.utilities.OffsetPosition; +import org.junit.Ignore; import org.junit.Test; import java.util.ArrayList; @@ -17,41 +18,44 @@ import static org.hamcrest.Matchers.hasSize; public class LexiconTest { -// @Test -// public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval() throws Exception { -// final String input = "This work was distributed on http:// github.com/ myUsername/ MyProject"; -// List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); -// -// List offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput); -// -// assertThat(offsetPositions, hasSize(1)); -// OffsetPosition FirstURL = offsetPositions.get(0); -// assertThat(input.substring(FirstURL.start, FirstURL.end), is("http:// github.com/ myUsername/ MyProject")); -// } -// -// @Test -// public void testCharacterPositionsUrlPattern_two_URL_shouldReturnCorrectInterval() throws Exception { -// final String input = "This work was distributed on http:// github.com/ myUsername/ MyProject. 
The data is available at https :// github.com/ superconductors/ hola.";
-// List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
-//
-// List offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput);
-//
-// assertThat(offsetPositions, hasSize(2));
-// OffsetPosition url = offsetPositions.get(1);
-// assertThat(input.substring(url.start, url.end), is("https :// github.com/ superconductors/ hola"));
-// }
-//
-// @Test
-// public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval_2() throws Exception {
-// final String input = "720 137409 The Government of Lao PDR 2005 Forestry Strategy to the year 2020 of the Lao PDR (available at: https://faolex.fao.org/ docs/pdf/lao144178.pdf)";
-// List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
-//
-// List offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput);
-//
-// assertThat(offsetPositions, hasSize(1));
-// OffsetPosition url = offsetPositions.get(0);
-// assertThat(input.substring(url.start, url.end), is("https://faolex.fao.org/ docs/pdf/lao144178.pdf"));
-// }
+ @Test
+ public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval() throws Exception {
+ final String input = "This work was distributed on http:// github.com/myUsername/MyProject";
+ List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+
+ List offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition FirstURL = offsetPositions.get(0);
+ assertThat(input.substring(FirstURL.start, FirstURL.end), is("http:// github.com/myUsername/MyProject"));
+ }
+
+ @Test
+ public void testTokenPositionsUrlPattern_URL_shouldReturnCorrectInterval() throws Exception {
+ final String input = "This work was distributed on http:// github.com/myUsername/MyProject";
+ List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+
+ List offsetPositions = Lexicon.tokenPositionsUrlPattern(tokenisedInput);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition FirstURL = offsetPositions.get(0);
+ //Note: The intervals returned by the method Utilities.convertStringOffsetToTokenOffset
+ // consider the upper index to be included, while Java considers the upper index to be excluded
+ assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(FirstURL.start, FirstURL.end + 1)), is("http:// github.com/myUsername/MyProject"));
+ }
+
+ @Test
+ @Ignore("This test will fail, it can be used to test a real case when updating the regular expression")
+ public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval_2() throws Exception {
+ final String input = "720 137409 The Government of Lao PDR 2005 Forestry Strategy to the year 2020 of the Lao PDR (available at: https://faolex.fao.org/ docs/pdf/lao144178.pdf)";
+ List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+
+ List offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition url = offsetPositions.get(0);
+ assertThat(input.substring(url.start, url.end), is("https://faolex.fao.org/ docs/pdf/lao144178.pdf"));
+ }

 @Test
 public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval() throws Exception {
@@ -111,9 +115,6 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC
 lastTokenOfTheURL.setWidth(4.363655172413793);
lastTokenOfTheURL.setHeight(9.702);

- //This is the actual text that is passed and is different from the layoutToken text.
- final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. ";
-
 PDFAnnotation annotation = new PDFAnnotation();
 annotation.setPageNumber(9);
 List boundingBoxes = new ArrayList<>();
@@ -151,9 +152,6 @@ public void testTokensPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorr
 lastTokenOfTheURL.setWidth(4.363655172413793);
 lastTokenOfTheURL.setHeight(9.702);

- //This is the actual text that is passed and is different from the layoutToken text.
- final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. ";
-
 PDFAnnotation annotation = new PDFAnnotation();
 annotation.setPageNumber(9);
 List boundingBoxes = new ArrayList<>();
@@ -191,7 +189,6 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC
 lastTokenOfTheURL.setWidth(4.363655172413793);
 lastTokenOfTheURL.setHeight(9.702);
-
 PDFAnnotation annotation = new PDFAnnotation();
 annotation.setPageNumber(9);
 List boundingBoxes = new ArrayList<>();

From 6ff15ee87db55c010b846e6a8a7120123534c7bf Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Wed, 17 Apr 2024 08:30:20 +0700
Subject: [PATCH 10/14] keep convention on the token/character calculation

---
 .../main/java/org/grobid/core/lexicon/Lexicon.java | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
index 5bd5e642b9..681a0da7a7 100755
--- a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
+++ b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
@@ -1195,24 +1195,24 @@ public static List characterPositionsUrlPatternWithPdfAnnotation

 List urlTokensPositions = tokenPositionUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations);

+ // We need to adjust the end of the positions to avoid problems with the sublist
+ // that is used in the following method
+ urlTokensPositions.stream().forEach(o -> o.end += 1);
+
 // here we need to match the offsetPositions related to the text obtained by the layoutTokens, with the text
 // which may be different (spaces, hyphens, line breaks)
 return TextUtilities.matchTokenAndString(layoutTokens, text, urlTokensPositions);
 }

 /**
- * This method returns the token positions in respect of the layout tokens
+ * This method returns the token positions with respect to the layout tokens,
+ * the output token offsets are (inclusive, inclusive)
 */
 public static List tokenPositionUrlPatternWithPdfAnnotations(
 List layoutTokens,
 List pdfAnnotations) {

- List offsetPositions = 
convertStringOffsetToTokenOffset(characterPositionsUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations), layoutTokens); - // We need to adjust the end of the positions to avoid problems with the sublist - - offsetPositions.stream().forEach(o -> o.end += 1); - - return offsetPositions; + return convertStringOffsetToTokenOffset(characterPositionsUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations), layoutTokens); } /** From 3900dc228b462dd951605ef31a7809fb18e6233c Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 28 Apr 2024 09:54:46 +0800 Subject: [PATCH 11/14] update test to follow the convention --- .../src/test/java/org/grobid/core/lexicon/LexiconTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java index abf407dbbf..c70c930435 100644 --- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java +++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java @@ -165,7 +165,8 @@ public void testTokensPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorr assertThat(offsetPositions, hasSize(1)); OffsetPosition url = offsetPositions.get(0); - assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(url.start, url.end)), is("https://github.com/lfoppiano/ \nsupercon2")); + // LF: we need a + 1 because the convention for the tokenPositionUrlPattern is inclusive, inclusive + assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(url.start, url.end + 1)), is("https://github.com/lfoppiano/ \nsupercon2")); } @Test From ec52f13948f854fc28c44f4a37f357c1ee9f44b2 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sat, 4 May 2024 12:50:26 +0900 Subject: [PATCH 12/14] get fixes on matchTokenAndString from PR #1099 --- .../grobid/core/utilities/TextUtilities.java | 38 +++-- .../core/utilities/TextUtilitiesTest.java | 136 +++++++++++++++++- 2 files changed, 160 insertions(+), 14 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java index 87224cace8..f0e6cf03af 100755 --- a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java @@ -1557,22 +1557,25 @@ public static org.apache.commons.lang3.tuple.Pair matchTokenAndString(List layoutTokens, String text, List urlPositions) { + public static List matchTokenAndString(List layoutTokens, String text, List positions) { List newPositions = new ArrayList<>(); StringBuilder accumulator = new StringBuilder(); int pos = 0; + int textPositionOfToken = 0; - for (OffsetPosition urlPosition : urlPositions) { - List urlTokens = layoutTokens.subList(urlPosition.start, urlPosition.end); + for (OffsetPosition position : positions) { + List annotationTokens = layoutTokens.subList(position.start, position.end); boolean first = true; - for (int i = 0; i < urlTokens.size(); i++) { - LayoutToken token = urlTokens.get(i); + accumulator = new StringBuilder(); + for (int i = 0; i < annotationTokens.size(); i++) { + LayoutToken token = annotationTokens.get(i); if (StringUtils.isEmpty(token.getText())) continue; - int newPos = text.indexOf(token.getText(), pos); - if (newPos != -1) { + textPositionOfToken = text.indexOf(token.getText(), pos); + if (textPositionOfToken != -1) { + //We update pos only at the first token of the annotation positions if (first) { - pos = newPos; + pos 
= textPositionOfToken; first = false; } accumulator.append(token); @@ -1581,16 +1584,25 @@ public static List matchTokenAndString(List layoutT continue; } if (StringUtils.isNotEmpty(accumulator)) { + int accumulatorTextLength = accumulator.toString().length(); int start = text.indexOf(accumulator.toString(), pos); - newPositions.add(new OffsetPosition(start, start + accumulator.toString().length())); - accumulator = new StringBuilder(); - pos = newPos; - first = true; + int end = start + accumulatorTextLength; + newPositions.add(new OffsetPosition(start, end)); + pos = end; break; } - pos = newPos; + pos = textPositionOfToken; } } + if (StringUtils.isNotEmpty(accumulator)) { + int annotationTextLength = accumulator.toString().length(); + int start = text.indexOf(accumulator.toString(), pos); + int end = start + annotationTextLength; + newPositions.add(new OffsetPosition(start, end)); + pos = end; + accumulator = new StringBuilder(); + } + } if (StringUtils.isNotEmpty(accumulator)) { int start = text.indexOf(accumulator.toString(), pos); diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java index 6303dc6450..8b53cc263e 100644 --- a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java +++ b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java @@ -13,7 +13,6 @@ import java.util.regex.Matcher; import static org.hamcrest.CoreMatchers.is; -import static org.hamcrest.CoreMatchers.startsWith; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.hasSize; import static org.junit.Assert.*; @@ -436,4 +435,139 @@ public void testMatchTokenAndString() throws Exception { assertThat(inputReal.substring(url1.start, url1.end), is("https://github.com/lfoppiano/ supercon2")); } + + + @Test + public void testMatchTokenAndString_twoElements() throws Exception { + final String input = "This work is available at https://github.com/lfoppiano/ \n" + + "supercon2. The repository contains the code of the \n" + + "SuperCon 2 interface, the curation workflow, and the \n" + + "\n" + + "Table 2. Data support, the number of entities for each label in \n" + + "each of the datasets used for evaluating the ML models. The \n" + + "base dataset is the original dataset described in [18], and the \n" + + "curation dataset is automatically collected based on the data-\n" + + "base corrections by the interface and manually corrected. \n" + + "\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. 
"; + List urlTokens = Arrays.asList(new OffsetPosition(0, 3), new OffsetPosition(10, 23)); + + List offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, inputReal, urlTokens); + + assertThat(offsetPositions, hasSize(2)); + OffsetPosition url0 = offsetPositions.get(0); + assertThat(url0.start, is(0)); + assertThat(url0.end, is(9)); + + assertThat(inputReal.substring(url0.start, url0.end), is("This work")); + + OffsetPosition url1 = offsetPositions.get(1); + assertThat(url1.start, is(26)); + assertThat(url1.end, is(65)); + + assertThat(inputReal.substring(url1.start, url1.end), is("https://github.com/lfoppiano/ supercon2")); + + } + + @Test + public void testMatchTokenAndString_twoElementsWithEqualValue() throws Exception { + final String input = "Christophe Castagne, Claudie Marec, Claudie Marec, Claudio Stalder,"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + List urlTokens = Arrays.asList( + new OffsetPosition(0, 3), + new OffsetPosition(5, 8), + new OffsetPosition(10, 13), + new OffsetPosition(15, 18) + ); + + List offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, input, urlTokens); + + assertThat(offsetPositions, hasSize(4)); + + OffsetPosition url0 = offsetPositions.get(0); + assertThat(url0.start, is(0)); + assertThat(url0.end, is(19)); + + assertThat(input.substring(url0.start, url0.end), is("Christophe Castagne")); + + OffsetPosition url1 = offsetPositions.get(1); + assertThat(url1.start, is(21)); + assertThat(url1.end, is(34)); + + assertThat(input.substring(url1.start, url1.end), is("Claudie Marec")); + + OffsetPosition url2 = offsetPositions.get(2); + assertThat(url2.start, is(36)); + assertThat(url2.end, is(49)); + + assertThat(input.substring(url2.start, url2.end), is("Claudie Marec")); + + OffsetPosition url3 = offsetPositions.get(3); + assertThat(url3.start, is(51)); + assertThat(url3.end, is(66)); + + assertThat(input.substring(url3.start, url3.end), is("Claudio Stalder")); + + } + + @Test + public void testMatchTokenAndString_twoElementsWithEqualValue2() throws Exception { + final String input = "We thank Felix Randow, Shigeki Higashiyama and Feng Zhang for plasmids.We thank Florian Steinberg for discussions and disclosure of unpublished results.We thank Matthew Freeman for helpful discussions.We express our deep gratitude to Moises Mallo for advice concerning CRISPR plus CRISPR reagents.We are grateful for the assistance of Ana Nóvoa and IGC's transgenics and mouse facilities.We thank IGC's cell sorting/flow cytometry, sequencing, and histopathology facilities."; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + List annotationTokenPositions = Arrays.asList( + new OffsetPosition(4, 7), + new OffsetPosition(9, 12), + new OffsetPosition(15, 18), + new OffsetPosition(27, 30), + new OffsetPosition(49, 52), + new OffsetPosition(71, 74), + new OffsetPosition(103, 106), + new OffsetPosition(109, 110), + new OffsetPosition(125, 126) + ); + + List offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, input, annotationTokenPositions); + + assertThat(offsetPositions, hasSize(9)); + + OffsetPosition url7 = offsetPositions.get(7); + assertThat(url7.start, is(349)); + assertThat(url7.end, is(352)); + + assertThat(input.substring(url7.start, url7.end), is("IGC")); + + OffsetPosition url8 = offsetPositions.get(8); + assertThat(url8.start, is(397)); + assertThat(url8.end, is(400)); + + assertThat(input.substring(url8.start, url8.end), is("IGC")); + + } + + @Test 
+ public void testMatchTokenAndString_twoElementsWithEqualValue3() throws Exception {
+ final String input = "We thank Benoit Demars for providing reaeration data and comments that signficantly improved the manuscript.This study was supported a NERC Case studentship awarded to DP, GYD and SJ, an ERC starting grant awarded to GYD, and the University of Exeter.";
+
+ List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+ List annotationTokenPositions = Arrays.asList(
+ new OffsetPosition(4, 7),
+ new OffsetPosition(40, 41),
+ new OffsetPosition(62, 63),
+ new OffsetPosition(79, 84)
+ );
+
+ List offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, input, annotationTokenPositions);
+
+ assertThat(offsetPositions, hasSize(4));
+
+ OffsetPosition url7 = offsetPositions.get(1);
+ assertThat(input.substring(url7.start, url7.end), is("NERC"));
+
+ OffsetPosition url8 = offsetPositions.get(2);
+ assertThat(input.substring(url8.start, url8.end), is("ERC"));
+ }
 }

From f983f2548813a02e9dc1a0b37e5fefe1eafd4abb Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Thu, 9 May 2024 12:17:59 +0900
Subject: [PATCH 13/14] Add additional test and fix to the method so that the
 offsets correctly match the real text (dehyphenised)

---
 .../grobid/core/document/TEIFormatter.java | 2 +-
 .../org/grobid/core/lexicon/LexiconTest.java | 36 +++++++++++++++++++
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index 20a7746388..7283a2e513 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1873,7 +1873,7 @@ public void segmentIntoSentences(Element curParagraph, List curPara
 }
 }

- List offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations);
+ List offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations, text);
 forbiddenPositions.addAll(offsetPositionsUrls);

 List theSentences =

diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
index c70c930435..8b3b501488 100644
--- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
+++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
@@ -208,4 +208,40 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC
 OffsetPosition url = offsetPositions.get(0);
 assertThat(inputReal.substring(url.start, url.end), is("https://github.com/lfoppiano/ supercon2"));
 }
+
+ @Test
+ public void testCharacterPositionsUrlPatternWithPDFAnnotations2_URL_shouldReturnCorrectIntervalBasedOnText() throws Exception {
+ final String input = "Table S1: Gene annotations from which exon-exon junctions were extracted and unioned to obtain \n" +
+ "a list of annotated junctions. All tracks were taken from the UCSC Genome Browser [10] except for \n" +
+ "GENCODE [2], which was downloaded from the GENCODE website http://www.gencodegenes. \n" +
+ "org/releases/. Junction coordinates from hg38 annotations were lifted over to hg19 before the \n" +
+ "union was performed. Of all gene annotations listed here, the Swedish Bioinformatics Institute \n" +
+ "(SIB) genes has the most, with over 400,000 junctions for each of hg19 and hg38. 
\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + LayoutToken lastTokenOfTheURL = tokenisedInput.get(97); + lastTokenOfTheURL.setPage(19); + lastTokenOfTheURL.setX(465.54675000000003); + lastTokenOfTheURL.setY(404.908); + lastTokenOfTheURL.setWidth(68.727); + lastTokenOfTheURL.setHeight(9.0873); + + PDFAnnotation annotation = new PDFAnnotation(); + annotation.setPageNumber(19); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(19, 401.551, 402.396, 139.445, 12.901999999999987)); + annotation.setBoundingBoxes(boundingBoxes); + annotation.setDestination("http://www.gencodegenes.org/releases/"); + annotation.setType(PDFAnnotation.Type.URI); + List pdfAnnotations = List.of(annotation); + + //This is the actual text that is passed and is different from the layoutToken text. + final String inputReal = "Table S1: Gene annotations from which exon-exon junctions were extracted and unioned to obtain a list of annotated junctions. All tracks were taken from the UCSC Genome Browser [10] except for GENCODE [2], which was downloaded from the GENCODE website http://www.gencodegenes. org/releases/. Junction coordinates from hg38 annotations were lifted over to hg19 before the union was performed. Of all gene annotations listed here, the Swedish Bioinformatics Institute (SIB) genes has the most, with over 400,000 junctions for each of hg19 and hg38. "; + + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations, inputReal); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url = offsetPositions.get(0); + assertThat(inputReal.substring(url.start, url.end), is("http://www.gencodegenes. org/releases/")); + } } From 617aa16a29ccd578c5734d42c1a92fdfce01b811 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 9 May 2024 17:11:07 +0900 Subject: [PATCH 14/14] Apply url preservation also in tables description and notes --- grobid-core/src/main/java/org/grobid/core/data/Table.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java index 6356978837..14d468418c 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Table.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Table.java @@ -141,7 +141,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form } if (desc != null && config.isWithSentenceSegmentation()) { - formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage()); + formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations()); // we need a sentence segmentation of the table caption, for that we need to introduce // a

, then a

@@ -215,7 +215,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form

 if (noteNode != null && config.isWithSentenceSegmentation()) {
 // we need a sentence segmentation of the figure caption
- formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage());
+ formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
 }

 // enclose note content in a <p> element
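
---

The series above revolves around a few recurring techniques; the sketches below illustrate them in isolation. They are illustrative sketches only, not GROBID code: the class names, literal values, and the simplified URL pattern are assumptions made for the examples.

Patches 01 and 09 rely on regex-based URL detection over the paragraph text, in the spirit of Lexicon.characterPositionsUrlPattern. Note that GROBID's real pattern also has to tolerate the spaces that PDF extraction injects into URLs (as in the "http:// github.com/..." test fixtures), which this deliberately simplified stand-in does not:

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class UrlSpanSketch {
    // Deliberately simplified stand-in for GROBID's URL regex.
    private static final Pattern SIMPLE_URL = Pattern.compile("https?://\\S+[\\w/]");

    // Returns character spans (start inclusive, end exclusive) of every URL match.
    static List<int[]> findUrlSpans(String text) {
        List<int[]> spans = new ArrayList<>();
        Matcher m = SIMPLE_URL.matcher(text);
        while (m.find()) {
            spans.add(new int[]{m.start(), m.end()});
        }
        return spans;
    }

    public static void main(String[] args) {
        String text = "Code at https://github.com/kermitt2/delft/issues/150. More text.";
        for (int[] s : findUrlSpans(text)) {
            System.out.println(text.substring(s[0], s[1])); // trailing sentence dot excluded
        }
    }
}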
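Patches 09-11 hinge on two offset conventions that are easy to mix up: character offsets follow Java's substring convention (start inclusive, end exclusive), while the token offsets returned by tokenPositionUrlPatternWithPdfAnnotations are inclusive on both ends, so callers need end + 1 when slicing with subList. A minimal sketch, using plain strings as hypothetical stand-ins for LayoutToken:

import java.util.Arrays;
import java.util.List;

public class OffsetConventionSketch {
    public static void main(String[] args) {
        String text = "see http://example.org now";
        // Character offsets: start inclusive, end exclusive, as in String.substring.
        System.out.println(text.substring(4, 22)); // -> "http://example.org"

        // Token offsets under the (inclusive, inclusive) convention: the upper
        // bound is part of the span, hence the + 1 when calling subList.
        List<String> tokens = Arrays.asList("see", " ", "http", "://", "example", ".", "org", " ", "now");
        int tokStart = 2, tokEnd = 6; // both bounds included by convention
        System.out.println(String.join("", tokens.subList(tokStart, tokEnd + 1))); // -> "http://example.org"
    }
}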
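Patches 12 and 13 deal with re-anchoring token-level annotations onto a "real" text that differs from the concatenated tokens (dehyphenised, line breaks collapsed). A hedged sketch of the idea behind TextUtilities.matchTokenAndString: walk the annotation's tokens, locate each one in the target text, and grow a character interval. Names and structure are simplified; only the matching strategy is taken from the patch:

import java.util.List;

public class TokenToTextAlignmentSketch {
    // Returns {start, end} character offsets of the token sequence within text,
    // skipping pure-whitespace tokens that may not survive normalisation.
    static int[] align(List<String> annotationTokens, String text, int searchFrom) {
        int pos = searchFrom;
        int start = -1;
        for (String token : annotationTokens) {
            if (token.isBlank()) continue;   // spacing tokens need not survive normalisation
            int found = text.indexOf(token, pos);
            if (found < 0) break;            // token was rewritten (e.g. dehyphenised away)
            if (start < 0) start = found;    // the first matched token anchors the interval
            pos = found + token.length();
        }
        return new int[]{start, pos};
    }

    public static void main(String[] args) {
        // The tokenised URL spans a line break; the target text is the repaired flow.
        List<String> tokens = List.of("http", "://", "www", ".", "gencodegenes", ".", " ", "org", "/", "releases", "/");
        String real = "downloaded from the GENCODE website http://www.gencodegenes. org/releases/. Junctions follow.";
        int[] span = align(tokens, real, 0);
        System.out.println(real.substring(span[0], span[1])); // -> "http://www.gencodegenes. org/releases/"
    }
}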
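Finally, patches 01, 13 and 14 all feed the detected URL spans into the sentence segmenter as forbidden positions, so that a dot inside "gencodegenes. org" cannot be mistaken for a sentence boundary. A simplified stand-in for what SentenceUtilities.runSentenceDetection does with those spans; the naive full-stop boundary detector and the record syntax (Java 16+) are assumptions of the sketch:

import java.util.ArrayList;
import java.util.List;

public class ForbiddenSpanSketch {
    record Span(int start, int end) {} // character interval, end exclusive

    static boolean insideForbidden(int boundary, List<Span> forbidden) {
        for (Span s : forbidden) {
            if (boundary > s.start() && boundary < s.end()) return true;
        }
        return false;
    }

    public static void main(String[] args) {
        String text = "See http://www.gencodegenes. org/releases/. More text follows.";
        List<Span> urls = List.of(new Span(4, 42)); // span of the URL, inner dots included
        List<Integer> boundaries = new ArrayList<>();
        // Naive candidate boundaries: every full stop outside a forbidden span.
        for (int i = 0; i < text.length(); i++) {
            if (text.charAt(i) == '.' && !insideForbidden(i, urls)) {
                boundaries.add(i);
            }
        }
        System.out.println(boundaries); // only the two sentence-final dots survive
    }
}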