From 9d9455ae9d663822c76a1f94a66dd3375d986e74 Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Tue, 9 Apr 2024 07:59:21 +0900
Subject: [PATCH 01/14] add URL detection to avoid splitting URLs when running
 the sentence segmenter

---
 .../src/main/java/org/grobid/core/document/TEIFormatter.java | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index f66baaa0c0..33affa2d03 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -28,6 +28,7 @@ import org.grobid.core.exceptions.GrobidException;
 import org.grobid.core.lang.Language;
 import org.grobid.core.layout.*;
+import org.grobid.core.lexicon.Lexicon;
 import org.grobid.core.utilities.SentenceUtilities;
 import org.grobid.core.tokenization.TaggingTokenCluster;
 import org.grobid.core.tokenization.TaggingTokenClusteror;
@@ -1868,6 +1869,9 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
             }
         }
+        List<OffsetPosition> offsetPositionsUrls = Lexicon.getInstance().characterPositionsUrlPattern(curParagraphTokens);
+        forbiddenPositions.addAll(offsetPositionsUrls);
+
         List<OffsetPosition> theSentences =
             SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang));

From cff813863dbff4d31552d15a8e9e93fdf388d27c Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Thu, 11 Apr 2024 18:14:03 +0900
Subject: [PATCH 02/14] update lexicon and add more integration tests

---
 .../java/org/grobid/core/lexicon/Lexicon.java | 173 ++++++++++++------
 .../core/lexicon/LexiconIntegrationTest.java  | 159 +++++++++++++++-
 2 files changed, 270 insertions(+), 62 deletions(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
index 271dd6fa77..9666ad0c85 100755
--- a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
+++ b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
@@ -20,7 +20,9 @@ import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.SAXParserFactory;

+import org.apache.commons.collections4.CollectionUtils;
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
 import org.grobid.core.exceptions.GrobidException;
 import org.grobid.core.exceptions.GrobidResourceException;
 import org.grobid.core.lang.Language;
@@ -36,7 +38,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import org.apache.commons.lang3.tuple.Pair;
+import static org.grobid.core.utilities.Utilities.convertStringOffsetToTokenOffset;

/**
 * Class for managing all the lexical resources.
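 *
 * URL detection is also used by the TEIFormatter: the character positions of
 * recognized URLs are added to the "forbidden" positions so that the sentence
 * segmenter never splits inside a URL. A minimal usage sketch (hypothetical
 * driver code; the tokenization call mirrors the integration tests):
 *
 *   List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
 *   List<OffsetPosition> urls = Lexicon.getInstance().characterPositionsUrlPattern(tokens);
 *   forbiddenPositions.addAll(urls);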
@@ -101,19 +103,19 @@ private Lexicon() { initDictionary(); initNames(); // the loading of the journal and conference names is lazy - addDictionary(GrobidProperties.getGrobidHomePath() + File.separator + + addDictionary(GrobidProperties.getGrobidHomePath() + File.separator + "lexicon"+File.separator+"wordforms"+File.separator+"english.wf", Language.EN); - addDictionary(GrobidProperties.getGrobidHomePath() + File.separator + + addDictionary(GrobidProperties.getGrobidHomePath() + File.separator + "lexicon"+File.separator+"wordforms"+File.separator+"german.wf", Language.EN); addLastNames(GrobidProperties.getGrobidHomePath() + File.separator + "lexicon"+File.separator+"names"+File.separator+"names.family"); addLastNames(GrobidProperties.getGrobidHomePath() + File.separator + "lexicon"+File.separator+"names"+File.separator+"lastname.5k"); - addFirstNames(GrobidProperties.getGrobidHomePath() + File.separator + + addFirstNames(GrobidProperties.getGrobidHomePath() + File.separator + "lexicon"+File.separator+"names"+File.separator+"names.female"); - addFirstNames(GrobidProperties.getGrobidHomePath() + File.separator + + addFirstNames(GrobidProperties.getGrobidHomePath() + File.separator + "lexicon"+File.separator+"names"+File.separator+"names.male"); - addFirstNames(GrobidProperties.getGrobidHomePath() + File.separator + + addFirstNames(GrobidProperties.getGrobidHomePath() + File.separator + "lexicon"+File.separator+"names"+File.separator+"firstname.5k"); initCountryCodes(); addCountryCodes(GrobidProperties.getGrobidHomePath() + File.separator + @@ -465,33 +467,33 @@ public void initOrganisations() { try { organisationPattern = new FastMatcher(new File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/WikiOrganizations.lst")); - organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + + organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/government.government_agency")); - organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + + organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/known_corporations.lst")); - organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + + organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/venture_capital.venture_funded_company")); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for organisations.", e); } catch (IOException e) { - throw new GrobidResourceException("Cannot add term to matcher, because the lexicon resource file " + + throw new GrobidResourceException("Cannot add term to matcher, because the lexicon resource file " + "does not exist or cannot be read.", e); } catch (Exception e) { throw new GrobidException("An exception occured while running Grobid Lexicon init.", e); } } - + public void initOrgForms() { try { orgFormPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/orgClosings.txt")); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/orgClosings.txt")); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for organisations.", e); } catch (Exception e) { throw new GrobidException("An exception occured while running Grobid Lexicon init.", e); } } - + public void initLocations() { try { locationPattern = new FastMatcher(new @@ -522,8 +524,8 @@ public 
void initPersonSuffix() { public void initFunders() { try { funderPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/funders.txt"), - GrobidAnalyzer.getInstance(), true); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/funders.txt"), + GrobidAnalyzer.getInstance(), true); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for funders.", e); } catch (Exception e) { @@ -534,8 +536,8 @@ public void initFunders() { public void initResearchInfrastructures() { try { researchInfrastructurePattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/research_infrastructures.txt"), - GrobidAnalyzer.getInstance(), true); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/research_infrastructures.txt"), + GrobidAnalyzer.getInstance(), true); // store some name mapping researchOrganizations = new TreeMap<>(); @@ -563,7 +565,7 @@ public void initResearchInfrastructures() { String[] pieces = line.split(";", -1); // -1 for getting empty tokens too if (pieces.length == 3) { if (pieces[0].length() > 0) { - + if (pieces[1].length() > 0) { OrganizationRecord localInfra = new OrganizationRecord(pieces[0], pieces[1], "en"); List localInfraList = researchOrganizations.get(pieces[0].toLowerCase()); @@ -608,7 +610,7 @@ public void initResearchInfrastructures() { throw new GrobidResourceException("Error when compiling lexicon matcher for research infrastructure.", e); } catch (Exception e) { throw new GrobidException("An exception occured while running Grobid Lexicon init.", e); - } + } } /** @@ -642,15 +644,15 @@ public boolean isPunctuation(String s) { public List getOrganizationNamingInfo(String name) { if (researchOrganizations == null) return null; - return researchOrganizations.get(name.toLowerCase()); + return researchOrganizations.get(name.toLowerCase()); } /** * Map the language codes used by the language identifier component to the normal * language name. * - * Note: due to an older bug, kr is currently map to Korean too - this should - * disappear at some point in the future after retraining of models + * Note: due to an older bug, kr is currently map to Korean too - this should + * disappear at some point in the future after retraining of models * * @param code the language to be mapped */ @@ -896,7 +898,7 @@ public List charPositionsOrganisationNames(String s) { /** * Soft look-up in organisation names gazetteer for a tokenize sequence. - * It return a list of positions referring to the character positions within the input + * It return a list of positions referring to the character positions within the input * sequence. * * @param s the input list of LayoutToken @@ -987,7 +989,7 @@ public List tokenPositionsLocationNames(List s) { } /** - * Soft look-up in location name gazetteer for a string, return a list of positions referring + * Soft look-up in location name gazetteer for a string, return a list of positions referring * to the character positions within the string. * * For example "The car is in Milan" as Milan is a location, would return OffsetPosition(14,19) @@ -1004,7 +1006,7 @@ public List charPositionsLocationNames(String s) { } /** - * Soft look-up in location name gazetteer for a list of LayoutToken, return a list of + * Soft look-up in location name gazetteer for a list of LayoutToken, return a list of * positions referring to the character positions in the input sequence. 
* * For example "The car is in Milan" as Milan is a location, would return OffsetPosition(14,19) @@ -1092,7 +1094,7 @@ public List charPositionsPersonTitle(List s) { public List tokenPositionsIdentifierPattern(List tokens) { List result = new ArrayList(); String text = LayoutTokensUtil.toText(tokens); - + // DOI positions result = tokenPositionsDOIPattern(tokens, text); @@ -1115,10 +1117,10 @@ public List tokenPositionsIdentifierPattern(List to public List tokenPositionsDOIPattern(List tokens, String text) { List textResult = new ArrayList(); Matcher doiMatcher = TextUtilities.DOIPattern.matcher(text); - while (doiMatcher.find()) { + while (doiMatcher.find()) { textResult.add(new OffsetPosition(doiMatcher.start(), doiMatcher.end())); } - return Utilities.convertStringOffsetToTokenOffset(textResult, tokens); + return convertStringOffsetToTokenOffset(textResult, tokens); } /** @@ -1128,11 +1130,11 @@ public List tokenPositionsDOIPattern(List tokens, S public List tokenPositionsArXivPattern(List tokens, String text) { List textResult = new ArrayList(); Matcher arXivMatcher = TextUtilities.arXivPattern.matcher(text); - while (arXivMatcher.find()) { + while (arXivMatcher.find()) { //System.out.println(arXivMatcher.start() + " / " + arXivMatcher.end() + " / " + text.substring(arXivMatcher.start(), arXivMatcher.end())); textResult.add(new OffsetPosition(arXivMatcher.start(), arXivMatcher.end())); } - return Utilities.convertStringOffsetToTokenOffset(textResult, tokens); + return convertStringOffsetToTokenOffset(textResult, tokens); } @@ -1141,7 +1143,7 @@ public List tokenPositionsArXivPattern(List tokens, */ public List tokenPositionsISSNPattern(List tokens) { List result = new ArrayList(); - + // TBD ! return result; @@ -1161,50 +1163,99 @@ public List tokenPositionsISBNPattern(List tokens) /** * Identify in tokenized input the positions of an URL pattern with token positions */ - public List tokenPositionsUrlPattern(List tokens) { - //List result = new ArrayList(); - String text = LayoutTokensUtil.toText(tokens); - List textResult = new ArrayList(); - Matcher urlMatcher = TextUtilities.urlPattern.matcher(text); - while (urlMatcher.find()) { - //System.out.println(urlMatcher.start() + " / " + urlMatcher.end() + " / " + text.substring(urlMatcher.start(), urlMatcher.end())); - textResult.add(new OffsetPosition(urlMatcher.start(), urlMatcher.end())); - } - return Utilities.convertStringOffsetToTokenOffset(textResult, tokens); + public static List tokenPositionsUrlPattern(List tokens) { + List textResult = characterPositionsUrlPattern(tokens); + return convertStringOffsetToTokenOffset(textResult, tokens); } /** * Identify in tokenized input the positions of an URL pattern with character positions */ - public List characterPositionsUrlPattern(List tokens) { - //List result = new ArrayList(); + public static List characterPositionsUrlPattern(List tokens) { String text = LayoutTokensUtil.toText(tokens); List textResult = new ArrayList(); Matcher urlMatcher = TextUtilities.urlPattern.matcher(text); - while (urlMatcher.find()) { + while (urlMatcher.find()) { textResult.add(new OffsetPosition(urlMatcher.start(), urlMatcher.end())); } return textResult; } /** - * Identify in tokenized input the positions of an URL pattern with character positions, + * Identify in tokenized input the positions of a URL pattern with character positions, * and refine positions based on possible PDF URI annotations. 
- * + * * This will produce better quality recognized URL, avoiding missing suffixes and problems * with break lines and spaces. **/ +// public static List characterPositionsUrlPatternWithPdfAnnotations( +// List layoutTokens, +// List pdfAnnotations, +// String text) { +// +// List urlTokensPositions = tokensPositionUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations); +// +// // here we need to match the offsetPositions related to the text obtained by the layoutTokens, with the text +// // which may be different (spaces, hypen, breakline) +// StringBuilder accumulator = new StringBuilder(); +// List tokenizedText = GrobidAnalyzer.getInstance().tokenize(text); +// +// for (OffsetPosition urlOffsetPosition : urlTokensPositions) { +// int startTokenPosition = urlOffsetPosition.start; +// int endTokenPosition = urlOffsetPosition.end; +// +// List urlTokens = layoutTokens.subList(startTokenPosition, endTokenPosition); +// +// int tokenIndex = 0; +// int startPosition = 0; +// int endPosition = 0; +// for (LayoutToken token : urlTokens) { +// String tokenText = token.getText(); +// int textIndex = 0; +// for (int i = tokenIndex; i tokensPositionUrlPatternWithPdfAnnotations( + List layoutTokens, + List pdfAnnotations) { + + return convertStringOffsetToTokenOffset(characterPositionsUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations), layoutTokens); + } + + /** + * This method returns the character offsets in relation to the string obtained by the layout tokens. + * Notice the absence of the String text parameter. + */ public static List characterPositionsUrlPatternWithPdfAnnotations( - List layoutTokens, - List pdfAnnotations, - String text) { - List urlPositions = Lexicon.getInstance().characterPositionsUrlPattern(layoutTokens); + List layoutTokens, + List pdfAnnotations) { + List urlPositions = Lexicon.characterPositionsUrlPattern(layoutTokens); List resultPositions = new ArrayList<>(); // do we need to extend the url position based on additional position of the corresponding // PDF annotation? 
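        // Sketch of the refinement implemented by the loop below: locate the last layout
        // token of each regex-matched URL, search for a PDF URI annotation whose bounding
        // box contains that token, then extend the match with the following tokens while
        // they still belong to the annotation destination. For example (taken from the
        // integration tests), the regex match "https://github.com/lfoppiano/" followed by
        // the next-line token "supercon2" is merged into a single URL because the
        // annotation destination is "https://github.com/lfoppiano/supercon2".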
for(OffsetPosition urlPosition : urlPositions) { - int startPos = urlPosition.start; int endPos = urlPosition.end; @@ -1230,11 +1281,10 @@ public static List characterPositionsUrlPatternWithPdfAnnotation tokenIndex++; } - //String urlString = LayoutTokensUtil.toText(urlTokens); - String urlString = text.substring(startPos, endPos); + String urlString = LayoutTokensUtil.toText(urlTokens); PDFAnnotation targetAnnotation = null; - if (urlTokens.size()>0) { + if (CollectionUtils.isNotEmpty(urlTokens)) { LayoutToken lastToken = urlTokens.get(urlTokens.size()-1); if (pdfAnnotations != null) { for (PDFAnnotation pdfAnnotation : pdfAnnotations) { @@ -1253,7 +1303,7 @@ public static List characterPositionsUrlPatternWithPdfAnnotation String destination = targetAnnotation.getDestination(); int destinationPos = 0; - if (destination.indexOf(urlString) != -1) { + if (destination.contains(urlString)) { destinationPos = destination.indexOf(urlString)+urlString.length(); } @@ -1261,7 +1311,7 @@ public static List characterPositionsUrlPatternWithPdfAnnotation for(int j=endTokensIndex+1; j characterPositionsUrlPatternWithPdfAnnotation endPos += nextToken.getText().length(); destinationPos = pos + nextToken.getText().length(); urlTokens.add(nextToken); - } else + } else break; } } } // finally avoid ending a URL by a dot, because it can harm the sentence segmentation - if (text.charAt(endPos-1) == '.') - endPos = endPos-1; + if (StringUtils.substring(LayoutTokensUtil.toText(layoutTokens), startPos, endPos).endsWith(".")) { + endPos = endPos - 1; + } OffsetPosition position = new OffsetPosition(); position.start = startPos; @@ -1303,11 +1354,11 @@ public List tokenPositionsEmailPattern(List tokens) return new ArrayList(); List textResult = new ArrayList(); Matcher emailMatcher = TextUtilities.emailPattern.matcher(text); - while (emailMatcher.find()) { + while (emailMatcher.find()) { //System.out.println(urlMatcher.start() + " / " + urlMatcher.end() + " / " + text.substring(urlMatcher.start(), urlMatcher.end())); textResult.add(new OffsetPosition(emailMatcher.start(), emailMatcher.end())); } - return Utilities.convertStringOffsetToTokenOffset(textResult, tokens); + return convertStringOffsetToTokenOffset(textResult, tokens); } } diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java index 620f01a73a..9f42a7ce7a 100755 --- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java +++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java @@ -1,13 +1,16 @@ package org.grobid.core.lexicon; +import org.apache.commons.lang3.StringUtils; import org.grobid.core.analyzers.GrobidAnalyzer; +import org.grobid.core.layout.BoundingBox; +import org.grobid.core.layout.PDFAnnotation; import org.grobid.core.utilities.OffsetPosition; import org.grobid.core.utilities.LayoutTokensUtil; import org.grobid.core.layout.LayoutToken; -import org.junit.AfterClass; import org.junit.Before; import org.junit.Test; +import java.util.ArrayList; import java.util.List; import static org.hamcrest.CoreMatchers.is; @@ -407,4 +410,158 @@ public void testinFunders1Match() throws Exception { assertThat(positions.get(0).start, is(4)); assertThat(positions.get(0).end, is(6)); } + +// @Test +// public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval() throws Exception { +// final String input = "This work was distributed on http:// github.com/ myUsername/ MyProject"; +// 
List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); +// +// List offsetPositions = target.characterPositionsUrlPattern(tokenisedInput); +// +// assertThat(offsetPositions, hasSize(1)); +// OffsetPosition FirstURL = offsetPositions.get(0); +// assertThat(input.substring(FirstURL.start, FirstURL.end), is("http:// github.com/ myUsername/ MyProject")); +// } +// +// @Test +// public void testCharacterPositionsUrlPattern_two_URL_shouldReturnCorrectInterval() throws Exception { +// final String input = "This work was distributed on http:// github.com/ myUsername/ MyProject. The data is available at https :// github.com/ superconductors/ hola."; +// List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); +// +// List offsetPositions = target.characterPositionsUrlPattern(tokenisedInput); +// +// assertThat(offsetPositions, hasSize(2)); +// OffsetPosition url = offsetPositions.get(1); +// assertThat(input.substring(url.start, url.end), is("https :// github.com/ superconductors/ hola")); +// } +// +// @Test +// public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval_2() throws Exception { +// final String input = "720 137409 The Government of Lao PDR 2005 Forestry Strategy to the year 2020 of the Lao PDR (available at: https://faolex.fao.org/ docs/pdf/lao144178.pdf)"; +// List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); +// +// List offsetPositions = target.characterPositionsUrlPattern(tokenisedInput); +// +// assertThat(offsetPositions, hasSize(1)); +// OffsetPosition url = offsetPositions.get(0); +// assertThat(input.substring(url.start, url.end), is("https://faolex.fao.org/ docs/pdf/lao144178.pdf")); +// } + + @Test + public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval() throws Exception { + final String input = "1. 'internal status' indicates that their records should be \n" + + "hidden in the interface. \n" + + "2. In our previous work [1] we reported 77.03% F1-\n" + + "score. There is a slight decrease in absolute scores \n" + + "between DeLFT 0.2.8 and DeLFT 0.3.0. One cause \n" + + "may be the use of different hyperparameters in \n" + + "version 0.3.0 such as batch size and learning rate. \n" + + "However, the most probable cause could be the \n" + + "impact of using the Huggingface tokenizers \n" + + "library which is suffering from quality issues \n" + + "https://github.com/kermitt2/delft/issues/150. \n" + + "\n" + + "\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + //This is the actual text that is passed and is different from the layoutToken text. + final String inputReal = "1. 'internal status' indicates that their records should be hidden in the interface. 2. In our previous work [1] we reported 77.03% F1score. There is a slight decrease in absolute scores between DeLFT 0.2.8 and DeLFT 0.3.0. One cause may be the use of different hyperparameters in version 0.3.0 such as batch size and learning rate. However, the most probable cause could be the impact of using the Huggingface tokenizers library which is suffering from quality issues https://github.com/kermitt2/delft/issues/150. 
"; + + PDFAnnotation annotation = new PDFAnnotation(); + annotation.setPageNumber(10); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(10, 84.30, 706.68,177.39,9.52)); + annotation.setBoundingBoxes(boundingBoxes); + annotation.setDestination("https://github.com/kermitt2/delft/issues/150"); + annotation.setType(PDFAnnotation.Type.URI); + + List pdfAnnotations = List.of(annotation); + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url = offsetPositions.get(0); + assertThat(StringUtils.substring(input, url.start, url.end), is("https://github.com/kermitt2/delft/issues/150")); + } + + @Test + public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval2() throws Exception { + final String input = "This work is available at https://github.com/lfoppiano/ \n" + + "supercon2. The repository contains the code of the \n" + + "SuperCon 2 interface, the curation workflow, and the \n" + + "\n" + + "Table 2. Data support, the number of entities for each label in \n" + + "each of the datasets used for evaluating the ML models. The \n" + + "base dataset is the original dataset described in [18], and the \n" + + "curation dataset is automatically collected based on the data-\n" + + "base corrections by the interface and manually corrected. \n" + + "\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + LayoutToken lastTokenOfTheURL = tokenisedInput.get(19); + lastTokenOfTheURL.setPage(9); + lastTokenOfTheURL.setX(530.9363448275863); + lastTokenOfTheURL.setY(538.153); + lastTokenOfTheURL.setWidth(4.363655172413793); + lastTokenOfTheURL.setHeight(9.702); + + //This is the actual text that is passed and is different from the layoutToken text. + final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. "; + + PDFAnnotation annotation = new PDFAnnotation(); + annotation.setPageNumber(9); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(9,408.76,537.11,126.54,10.49)); + annotation.setBoundingBoxes(boundingBoxes); + annotation.setDestination("https://github.com/lfoppiano/supercon2"); + annotation.setType(PDFAnnotation.Type.URI); + + List pdfAnnotations = List.of(annotation); + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url = offsetPositions.get(0); + assertThat(input.substring(url.start, url.end), is("https://github.com/lfoppiano/ \nsupercon2")); + } + + @Test + public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectIntervalBasedOnText() throws Exception { + final String input = "This work is available at https://github.com/lfoppiano/ \n" + + "supercon2. The repository contains the code of the \n" + + "SuperCon 2 interface, the curation workflow, and the \n" + + "\n" + + "Table 2. 
Data support, the number of entities for each label in \n" + + "each of the datasets used for evaluating the ML models. The \n" + + "base dataset is the original dataset described in [18], and the \n" + + "curation dataset is automatically collected based on the data-\n" + + "base corrections by the interface and manually corrected. \n" + + "\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + LayoutToken lastTokenOfTheURL = tokenisedInput.get(19); + lastTokenOfTheURL.setPage(9); + lastTokenOfTheURL.setX(530.9363448275863); + lastTokenOfTheURL.setY(538.153); + lastTokenOfTheURL.setWidth(4.363655172413793); + lastTokenOfTheURL.setHeight(9.702); + + + PDFAnnotation annotation = new PDFAnnotation(); + annotation.setPageNumber(9); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(9,408.76,537.11,126.54,10.49)); + annotation.setBoundingBoxes(boundingBoxes); + annotation.setDestination("https://github.com/lfoppiano/supercon2"); + annotation.setType(PDFAnnotation.Type.URI); + List pdfAnnotations = List.of(annotation); + + //This is the actual text that is passed and is different from the layoutToken text. + final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. "; + + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations, inputReal); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url = offsetPositions.get(0); + assertThat(inputReal.substring(url.start, url.end), is("https://github.com/lfoppiano/ supercon2")); + } } \ No newline at end of file From dcda0dc51bab163020d41690733c52be365989c9 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 11 Apr 2024 18:14:22 +0900 Subject: [PATCH 03/14] typos --- .../org/grobid/core/engines/FundingAcknowledgementParser.java | 4 ++-- .../src/main/java/org/grobid/core/utilities/Utilities.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java index c92b270ff1..1068be3e28 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java @@ -102,7 +102,7 @@ public MutablePair,List,List,List,List convertStringOffsetToTokenOffsetOld( } /** - * This version uses actual LayoutToken offsets relative to the tokens present in argment only. + * This version uses actual LayoutToken offsets relative to the tokens present in argument only. * It supposes that the stringPosition have been identified on the provided tokens only, and not * restricted to the complete document. 
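 *
 * For example (mirroring the unit test added in the next patch): for the tokens of
 * "This is a token." and a string-level position (5, 9), the returned token-level
 * position has an inclusive end index, so that
 * tokens.subList(position.start, position.end + 1) yields "is a".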
*/ From a3cc84e464f181bc4d63879f949c66d8a65c2590 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 11 Apr 2024 20:47:28 +0900 Subject: [PATCH 04/14] Add test --- .../core/lexicon/LexiconIntegrationTest.java | 40 +++++++++++++++++++ .../grobid/core/utilities/UtilitiesTest.java | 19 +++++++++ 2 files changed, 59 insertions(+) diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java index 9f42a7ce7a..e312871c08 100755 --- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java +++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java @@ -524,6 +524,46 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC assertThat(input.substring(url.start, url.end), is("https://github.com/lfoppiano/ \nsupercon2")); } + @Test + public void testTokensPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval2() throws Exception { + final String input = "This work is available at https://github.com/lfoppiano/ \n" + + "supercon2. The repository contains the code of the \n" + + "SuperCon 2 interface, the curation workflow, and the \n" + + "\n" + + "Table 2. Data support, the number of entities for each label in \n" + + "each of the datasets used for evaluating the ML models. The \n" + + "base dataset is the original dataset described in [18], and the \n" + + "curation dataset is automatically collected based on the data-\n" + + "base corrections by the interface and manually corrected. \n" + + "\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + LayoutToken lastTokenOfTheURL = tokenisedInput.get(19); + lastTokenOfTheURL.setPage(9); + lastTokenOfTheURL.setX(530.9363448275863); + lastTokenOfTheURL.setY(538.153); + lastTokenOfTheURL.setWidth(4.363655172413793); + lastTokenOfTheURL.setHeight(9.702); + + //This is the actual text that is passed and is different from the layoutToken text. + final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. 
"; + + PDFAnnotation annotation = new PDFAnnotation(); + annotation.setPageNumber(9); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(9,408.76,537.11,126.54,10.49)); + annotation.setBoundingBoxes(boundingBoxes); + annotation.setDestination("https://github.com/lfoppiano/supercon2"); + annotation.setType(PDFAnnotation.Type.URI); + + List pdfAnnotations = List.of(annotation); + List offsetPositions = Lexicon.tokensPositionUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url = offsetPositions.get(0); + assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(url.start, url.end)), is("https://github.com/lfoppiano/ \nsupercon2")); + } + @Test public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectIntervalBasedOnText() throws Exception { final String input = "This work is available at https://github.com/lfoppiano/ \n" + diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/UtilitiesTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/UtilitiesTest.java index 64d6d4be7a..9e5a6958ff 100644 --- a/grobid-core/src/test/java/org/grobid/core/utilities/UtilitiesTest.java +++ b/grobid-core/src/test/java/org/grobid/core/utilities/UtilitiesTest.java @@ -3,10 +3,17 @@ import java.io.File; import java.io.IOException; +import java.util.Arrays; import java.util.List; import java.util.ArrayList; +import org.grobid.core.analyzers.GrobidAnalyzer; +import org.grobid.core.layout.LayoutToken; import org.junit.Test; + +import static org.grobid.core.utilities.Utilities.convertStringOffsetToTokenOffset; +import static org.hamcrest.Matchers.hasSize; +import static org.hamcrest.Matchers.is; import static org.junit.Assert.*; public class UtilitiesTest { @@ -112,4 +119,16 @@ public void testMergePositionsOverlap() throws IOException { assertEquals(positions.get(1).start, 7); assertEquals(positions.get(1).end, 10); } + + @Test + public void testConvertStringOffsetToTokenOffset() throws Exception { + String input = "This is a token."; + List layoutTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + OffsetPosition stringPosition = new OffsetPosition(5, 9); + List tokenOffsets = convertStringOffsetToTokenOffset(Arrays.asList(stringPosition), layoutTokens); + + assertThat(tokenOffsets, hasSize(1)); + OffsetPosition position = tokenOffsets.get(0); + assertThat(LayoutTokensUtil.toText(layoutTokens.subList(position.start, position.end + 1)), is("is a")); + } } From ddd9336d3e42e97e9aa5622886ab4c60fedd174c Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 11 Apr 2024 20:48:14 +0900 Subject: [PATCH 05/14] improvements --- .../main/java/org/grobid/core/engines/FullTextParser.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 28eda7e693..953d92f8b1 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -11,6 +11,7 @@ import java.nio.charset.StandardCharsets; +import org.apache.lucene.util.CollectionUtil; import org.grobid.core.GrobidModels; import org.grobid.core.data.*; import org.grobid.core.document.Document; @@ -478,7 +479,7 @@ public Pair> processShort(List tokens, Do List currentChunk = new ArrayList<>(); int currentPos = 0; for(LayoutToken token : 
tokens) { - if (currentChunk.size() != 0) { + if (CollectionUtils.isNotEmpty(currentChunk)) { int tokenPos = token.getOffset(); if (currentPos != tokenPos) { // new chunk @@ -508,7 +509,7 @@ public Pair> processShort(List tokens, Do LayoutTokenization layouts = featSeg.getRight(); if (layouts != null) layoutTokenization = layouts.getTokenization(); - if ( (featuredText != null) && (featuredText.trim().length() > 0) ) { + if (StringUtils.isNotBlank(featuredText)) { res = label(featuredText); res = postProcessFullTextLabeledText(res); } From ca3c3524ae4422b56f23d9e2cefef8174a926e8b Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 12 Apr 2024 06:27:22 +0900 Subject: [PATCH 06/14] add method to match the offset from the layout token raw string to a "postprocessed" text --- .../java/org/grobid/core/data/Figure.java | 2 +- .../grobid/core/document/TEIFormatter.java | 22 ++++--- .../java/org/grobid/core/lexicon/Lexicon.java | 62 +++++-------------- .../grobid/core/utilities/TextUtilities.java | 43 +++++++++++++ .../core/utilities/TextUtilitiesTest.java | 29 +++++++++ 5 files changed, 103 insertions(+), 55 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/data/Figure.java b/grobid-core/src/main/java/org/grobid/core/data/Figure.java index e9417e9217..ef4117e93c 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Figure.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Figure.java @@ -432,7 +432,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form } if (desc != null && config.isWithSentenceSegmentation()) { - formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage()); + formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations()); // we need a sentence segmentation of the figure caption, for that we need to introduce // a
<div>, then a <p>
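            // Note: passing doc.getPDFAnnotations() through to segmentIntoSentences (the
            // TEIFormatter change in this same patch) means URLs confirmed by PDF link
            // annotations are protected from sentence splitting in figure captions too.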
diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 33affa2d03..20a7746388 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1321,7 +1321,7 @@ private StringBuilder toTEINote(StringBuilder tei, if (config.isWithSentenceSegmentation()) { - segmentIntoSentences(pNote, noteTokens, config, doc.getLanguage()); + segmentIntoSentences(pNote, noteTokens, config, doc.getLanguage(), doc.getPDFAnnotations()); } desc.appendChild(pNote); @@ -1523,7 +1523,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(clusterTokens); if (isNewParagraph(lastClusterLabel, curParagraph)) { if (curParagraph != null && config.isWithSentenceSegmentation()) { - segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage()); + segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage(), doc.getPDFAnnotations()); } curParagraph = teiElement("p"); if (config.isGenerateTeiIds()) { @@ -1551,7 +1551,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, } else { if (isNewParagraph(lastClusterLabel, curParagraph)) { if (curParagraph != null && config.isWithSentenceSegmentation()) { - segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage()); + segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage(), doc.getPDFAnnotations()); } curParagraph = teiElement("p"); if (config.isGenerateTeiIds()) { @@ -1767,7 +1767,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, // in case we segment paragraph into sentences, we still need to do it for the last paragraph if (curParagraph != null && config.isWithSentenceSegmentation()) { - segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage()); + segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage(), doc.getPDFAnnotations()); } // remove possibly empty div in the div list @@ -1834,6 +1834,10 @@ public static boolean isNewParagraph(TaggingLabel lastClusterLabel, Element curP } public void segmentIntoSentences(Element curParagraph, List curParagraphTokens, GrobidAnalysisConfig config, String lang) { + segmentIntoSentences(curParagraph, curParagraphTokens, config, lang, new ArrayList<>()); + } + + public void segmentIntoSentences(Element curParagraph, List curParagraphTokens, GrobidAnalysisConfig config, String lang, List annotations) { // in order to avoid having a sentence boundary in the middle of a ref element // (which is frequent given the abbreviation in the reference expression, e.g. Fig.) // we only consider for sentence segmentation texts under
<p>
and skip the text under . @@ -1842,7 +1846,7 @@ public void segmentIntoSentences(Element curParagraph, List curPara // in xom, the following gives all the text under the element, for the whole subtree String text = curParagraph.getValue(); - if (text == null || text.length() == 0) + if (StringUtils.isEmpty(text)) return; // identify ref nodes, ref spans and ref positions @@ -1859,8 +1863,8 @@ public void segmentIntoSentences(Element curParagraph, List curPara // for readability in another conditional if (((Element) theNode).getLocalName().equals("ref")) { // map character offset of the node - mapRefNodes.put(Integer.valueOf(pos), theNode); - refPositions.add(Integer.valueOf(pos)); + mapRefNodes.put(pos, theNode); + refPositions.add(pos); String chunk = theNode.getValue(); forbiddenPositions.add(new OffsetPosition(pos, pos+chunk.length())); @@ -1869,7 +1873,7 @@ public void segmentIntoSentences(Element curParagraph, List curPara } } - List offsetPositionsUrls = Lexicon.getInstance().characterPositionsUrlPattern(curParagraphTokens); + List offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations); forbiddenPositions.addAll(offsetPositionsUrls); List theSentences = @@ -1894,7 +1898,7 @@ public void segmentIntoSentences(Element curParagraph, List curPara for(int i=0; i characterPositionsUrlPattern(List characterPositionsUrlPatternWithPdfAnnotations( -// List layoutTokens, -// List pdfAnnotations, -// String text) { -// -// List urlTokensPositions = tokensPositionUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations); -// -// // here we need to match the offsetPositions related to the text obtained by the layoutTokens, with the text -// // which may be different (spaces, hypen, breakline) -// StringBuilder accumulator = new StringBuilder(); -// List tokenizedText = GrobidAnalyzer.getInstance().tokenize(text); -// -// for (OffsetPosition urlOffsetPosition : urlTokensPositions) { -// int startTokenPosition = urlOffsetPosition.start; -// int endTokenPosition = urlOffsetPosition.end; -// -// List urlTokens = layoutTokens.subList(startTokenPosition, endTokenPosition); -// -// int tokenIndex = 0; -// int startPosition = 0; -// int endPosition = 0; -// for (LayoutToken token : urlTokens) { -// String tokenText = token.getText(); -// int textIndex = 0; -// for (int i = tokenIndex; i characterPositionsUrlPatternWithPdfAnnotations( + List layoutTokens, + List pdfAnnotations, + String text) { + + List urlTokensPositions = tokensPositionUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations); + + // here we need to match the offsetPositions related to the text obtained by the layoutTokens, with the text + // which may be different (spaces, hypen, breakline) + return TextUtilities.matchTokenAndString(layoutTokens, text, urlTokensPositions); + } /** * This method returns the token positions in respect of the layout tokens @@ -1240,7 +1207,12 @@ public static List tokensPositionUrlPatternWithPdfAnnotations( List layoutTokens, List pdfAnnotations) { - return convertStringOffsetToTokenOffset(characterPositionsUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations), layoutTokens); + List offsetPositions = convertStringOffsetToTokenOffset(characterPositionsUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations), layoutTokens); + // We need to adjust the end of the positions to avoid problems with the sublist + + offsetPositions.stream().forEach(o -> o.end += 1); + + return offsetPositions; } /** diff --git 
a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java index 06f69bcdee..87224cace8 100755 --- a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java @@ -1556,4 +1556,47 @@ public static org.apache.commons.lang3.tuple.Pair matchTokenAndString(List layoutTokens, String text, List urlPositions) { + List newPositions = new ArrayList<>(); + StringBuilder accumulator = new StringBuilder(); + int pos = 0; + + for (OffsetPosition urlPosition : urlPositions) { + List urlTokens = layoutTokens.subList(urlPosition.start, urlPosition.end); + boolean first = true; + for (int i = 0; i < urlTokens.size(); i++) { + LayoutToken token = urlTokens.get(i); + if (StringUtils.isEmpty(token.getText())) + continue; + int newPos = text.indexOf(token.getText(), pos); + if (newPos != -1) { + if (first) { + pos = newPos; + first = false; + } + accumulator.append(token); + } else { + if (SentenceUtilities.toSkipToken(token.getText())) { + continue; + } + if (StringUtils.isNotEmpty(accumulator)) { + int start = text.indexOf(accumulator.toString(), pos); + newPositions.add(new OffsetPosition(start, start + accumulator.toString().length())); + accumulator = new StringBuilder(); + pos = newPos; + first = true; + break; + } + pos = newPos; + } + } + } + if (StringUtils.isNotEmpty(accumulator)) { + int start = text.indexOf(accumulator.toString(), pos); + newPositions.add(new OffsetPosition(start, start + accumulator.toString().length())); + } + + return newPositions; + } } diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java index ff5ac7467b..6303dc6450 100644 --- a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java +++ b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java @@ -8,12 +8,14 @@ import org.junit.Test; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.regex.Matcher; import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.CoreMatchers.startsWith; import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.hasSize; import static org.junit.Assert.*; @@ -407,4 +409,31 @@ public void testOrcidPattern() { } } } + + @Test + public void testMatchTokenAndString() throws Exception { + final String input = "This work is available at https://github.com/lfoppiano/ \n" + + "supercon2. The repository contains the code of the \n" + + "SuperCon 2 interface, the curation workflow, and the \n" + + "\n" + + "Table 2. Data support, the number of entities for each label in \n" + + "each of the datasets used for evaluating the ML models. The \n" + + "base dataset is the original dataset described in [18], and the \n" + + "curation dataset is automatically collected based on the data-\n" + + "base corrections by the interface and manually corrected. \n" + + "\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. 
The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. "; + List urlTokens = Arrays.asList(new OffsetPosition(10, 23)); + + List offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, inputReal, urlTokens); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url1 = offsetPositions.get(0); + assertThat(url1.start, is(26)); + assertThat(url1.end, is(65)); + assertThat(inputReal.substring(url1.start, url1.end), is("https://github.com/lfoppiano/ supercon2")); + + } } From fbbf254b7ae8db3d33b6421488c742f008082a0b Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 12 Apr 2024 06:36:09 +0900 Subject: [PATCH 07/14] Use a lexicon normal test for static methods --- .../core/lexicon/LexiconIntegrationTest.java | 194 ---------------- .../org/grobid/core/lexicon/LexiconTest.java | 213 ++++++++++++++++++ 2 files changed, 213 insertions(+), 194 deletions(-) create mode 100644 grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java index e312871c08..2d888520ec 100755 --- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java +++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java @@ -410,198 +410,4 @@ public void testinFunders1Match() throws Exception { assertThat(positions.get(0).start, is(4)); assertThat(positions.get(0).end, is(6)); } - -// @Test -// public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval() throws Exception { -// final String input = "This work was distributed on http:// github.com/ myUsername/ MyProject"; -// List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); -// -// List offsetPositions = target.characterPositionsUrlPattern(tokenisedInput); -// -// assertThat(offsetPositions, hasSize(1)); -// OffsetPosition FirstURL = offsetPositions.get(0); -// assertThat(input.substring(FirstURL.start, FirstURL.end), is("http:// github.com/ myUsername/ MyProject")); -// } -// -// @Test -// public void testCharacterPositionsUrlPattern_two_URL_shouldReturnCorrectInterval() throws Exception { -// final String input = "This work was distributed on http:// github.com/ myUsername/ MyProject. 
The data is available at https :// github.com/ superconductors/ hola."; -// List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); -// -// List offsetPositions = target.characterPositionsUrlPattern(tokenisedInput); -// -// assertThat(offsetPositions, hasSize(2)); -// OffsetPosition url = offsetPositions.get(1); -// assertThat(input.substring(url.start, url.end), is("https :// github.com/ superconductors/ hola")); -// } -// -// @Test -// public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval_2() throws Exception { -// final String input = "720 137409 The Government of Lao PDR 2005 Forestry Strategy to the year 2020 of the Lao PDR (available at: https://faolex.fao.org/ docs/pdf/lao144178.pdf)"; -// List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); -// -// List offsetPositions = target.characterPositionsUrlPattern(tokenisedInput); -// -// assertThat(offsetPositions, hasSize(1)); -// OffsetPosition url = offsetPositions.get(0); -// assertThat(input.substring(url.start, url.end), is("https://faolex.fao.org/ docs/pdf/lao144178.pdf")); -// } - - @Test - public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval() throws Exception { - final String input = "1. 'internal status' indicates that their records should be \n" + - "hidden in the interface. \n" + - "2. In our previous work [1] we reported 77.03% F1-\n" + - "score. There is a slight decrease in absolute scores \n" + - "between DeLFT 0.2.8 and DeLFT 0.3.0. One cause \n" + - "may be the use of different hyperparameters in \n" + - "version 0.3.0 such as batch size and learning rate. \n" + - "However, the most probable cause could be the \n" + - "impact of using the Huggingface tokenizers \n" + - "library which is suffering from quality issues \n" + - "https://github.com/kermitt2/delft/issues/150. \n" + - "\n" + - "\n"; - - List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); - - //This is the actual text that is passed and is different from the layoutToken text. - final String inputReal = "1. 'internal status' indicates that their records should be hidden in the interface. 2. In our previous work [1] we reported 77.03% F1score. There is a slight decrease in absolute scores between DeLFT 0.2.8 and DeLFT 0.3.0. One cause may be the use of different hyperparameters in version 0.3.0 such as batch size and learning rate. However, the most probable cause could be the impact of using the Huggingface tokenizers library which is suffering from quality issues https://github.com/kermitt2/delft/issues/150. 
"; - - PDFAnnotation annotation = new PDFAnnotation(); - annotation.setPageNumber(10); - List boundingBoxes = new ArrayList<>(); - boundingBoxes.add(BoundingBox.fromPointAndDimensions(10, 84.30, 706.68,177.39,9.52)); - annotation.setBoundingBoxes(boundingBoxes); - annotation.setDestination("https://github.com/kermitt2/delft/issues/150"); - annotation.setType(PDFAnnotation.Type.URI); - - List pdfAnnotations = List.of(annotation); - List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); - - assertThat(offsetPositions, hasSize(1)); - OffsetPosition url = offsetPositions.get(0); - assertThat(StringUtils.substring(input, url.start, url.end), is("https://github.com/kermitt2/delft/issues/150")); - } - - @Test - public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval2() throws Exception { - final String input = "This work is available at https://github.com/lfoppiano/ \n" + - "supercon2. The repository contains the code of the \n" + - "SuperCon 2 interface, the curation workflow, and the \n" + - "\n" + - "Table 2. Data support, the number of entities for each label in \n" + - "each of the datasets used for evaluating the ML models. The \n" + - "base dataset is the original dataset described in [18], and the \n" + - "curation dataset is automatically collected based on the data-\n" + - "base corrections by the interface and manually corrected. \n" + - "\n"; - - List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); - LayoutToken lastTokenOfTheURL = tokenisedInput.get(19); - lastTokenOfTheURL.setPage(9); - lastTokenOfTheURL.setX(530.9363448275863); - lastTokenOfTheURL.setY(538.153); - lastTokenOfTheURL.setWidth(4.363655172413793); - lastTokenOfTheURL.setHeight(9.702); - - //This is the actual text that is passed and is different from the layoutToken text. - final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. "; - - PDFAnnotation annotation = new PDFAnnotation(); - annotation.setPageNumber(9); - List boundingBoxes = new ArrayList<>(); - boundingBoxes.add(BoundingBox.fromPointAndDimensions(9,408.76,537.11,126.54,10.49)); - annotation.setBoundingBoxes(boundingBoxes); - annotation.setDestination("https://github.com/lfoppiano/supercon2"); - annotation.setType(PDFAnnotation.Type.URI); - - List pdfAnnotations = List.of(annotation); - List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); - - assertThat(offsetPositions, hasSize(1)); - OffsetPosition url = offsetPositions.get(0); - assertThat(input.substring(url.start, url.end), is("https://github.com/lfoppiano/ \nsupercon2")); - } - - @Test - public void testTokensPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval2() throws Exception { - final String input = "This work is available at https://github.com/lfoppiano/ \n" + - "supercon2. The repository contains the code of the \n" + - "SuperCon 2 interface, the curation workflow, and the \n" + - "\n" + - "Table 2. 
Data support, the number of entities for each label in \n" + - "each of the datasets used for evaluating the ML models. The \n" + - "base dataset is the original dataset described in [18], and the \n" + - "curation dataset is automatically collected based on the data-\n" + - "base corrections by the interface and manually corrected. \n" + - "\n"; - - List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); - LayoutToken lastTokenOfTheURL = tokenisedInput.get(19); - lastTokenOfTheURL.setPage(9); - lastTokenOfTheURL.setX(530.9363448275863); - lastTokenOfTheURL.setY(538.153); - lastTokenOfTheURL.setWidth(4.363655172413793); - lastTokenOfTheURL.setHeight(9.702); - - //This is the actual text that is passed and is different from the layoutToken text. - final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. "; - - PDFAnnotation annotation = new PDFAnnotation(); - annotation.setPageNumber(9); - List boundingBoxes = new ArrayList<>(); - boundingBoxes.add(BoundingBox.fromPointAndDimensions(9,408.76,537.11,126.54,10.49)); - annotation.setBoundingBoxes(boundingBoxes); - annotation.setDestination("https://github.com/lfoppiano/supercon2"); - annotation.setType(PDFAnnotation.Type.URI); - - List pdfAnnotations = List.of(annotation); - List offsetPositions = Lexicon.tokensPositionUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); - - assertThat(offsetPositions, hasSize(1)); - OffsetPosition url = offsetPositions.get(0); - assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(url.start, url.end)), is("https://github.com/lfoppiano/ \nsupercon2")); - } - - @Test - public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectIntervalBasedOnText() throws Exception { - final String input = "This work is available at https://github.com/lfoppiano/ \n" + - "supercon2. The repository contains the code of the \n" + - "SuperCon 2 interface, the curation workflow, and the \n" + - "\n" + - "Table 2. Data support, the number of entities for each label in \n" + - "each of the datasets used for evaluating the ML models. The \n" + - "base dataset is the original dataset described in [18], and the \n" + - "curation dataset is automatically collected based on the data-\n" + - "base corrections by the interface and manually corrected. 
\n" + - "\n"; - - List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); - LayoutToken lastTokenOfTheURL = tokenisedInput.get(19); - lastTokenOfTheURL.setPage(9); - lastTokenOfTheURL.setX(530.9363448275863); - lastTokenOfTheURL.setY(538.153); - lastTokenOfTheURL.setWidth(4.363655172413793); - lastTokenOfTheURL.setHeight(9.702); - - - PDFAnnotation annotation = new PDFAnnotation(); - annotation.setPageNumber(9); - List boundingBoxes = new ArrayList<>(); - boundingBoxes.add(BoundingBox.fromPointAndDimensions(9,408.76,537.11,126.54,10.49)); - annotation.setBoundingBoxes(boundingBoxes); - annotation.setDestination("https://github.com/lfoppiano/supercon2"); - annotation.setType(PDFAnnotation.Type.URI); - List pdfAnnotations = List.of(annotation); - - //This is the actual text that is passed and is different from the layoutToken text. - final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. "; - - List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations, inputReal); - - assertThat(offsetPositions, hasSize(1)); - OffsetPosition url = offsetPositions.get(0); - assertThat(inputReal.substring(url.start, url.end), is("https://github.com/lfoppiano/ supercon2")); - } } \ No newline at end of file diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java new file mode 100644 index 0000000000..f9d1e4d7c4 --- /dev/null +++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java @@ -0,0 +1,213 @@ +package org.grobid.core.lexicon; + +import org.apache.commons.lang3.StringUtils; +import org.grobid.core.analyzers.GrobidAnalyzer; +import org.grobid.core.layout.BoundingBox; +import org.grobid.core.layout.LayoutToken; +import org.grobid.core.layout.PDFAnnotation; +import org.grobid.core.utilities.LayoutTokensUtil; +import org.grobid.core.utilities.OffsetPosition; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.List; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.hasSize; + +public class LexiconTest { +// @Test +// public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval() throws Exception { +// final String input = "This work was distributed on http:// github.com/ myUsername/ MyProject"; +// List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); +// +// List offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput); +// +// assertThat(offsetPositions, hasSize(1)); +// OffsetPosition FirstURL = offsetPositions.get(0); +// assertThat(input.substring(FirstURL.start, FirstURL.end), is("http:// github.com/ myUsername/ MyProject")); +// } +// +// @Test +// public void testCharacterPositionsUrlPattern_two_URL_shouldReturnCorrectInterval() throws Exception { +// final String input = "This work was distributed on http:// github.com/ myUsername/ MyProject. 
The data is available at https :// github.com/ superconductors/ hola."; +// List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); +// +// List offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput); +// +// assertThat(offsetPositions, hasSize(2)); +// OffsetPosition url = offsetPositions.get(1); +// assertThat(input.substring(url.start, url.end), is("https :// github.com/ superconductors/ hola")); +// } +// +// @Test +// public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval_2() throws Exception { +// final String input = "720 137409 The Government of Lao PDR 2005 Forestry Strategy to the year 2020 of the Lao PDR (available at: https://faolex.fao.org/ docs/pdf/lao144178.pdf)"; +// List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); +// +// List offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput); +// +// assertThat(offsetPositions, hasSize(1)); +// OffsetPosition url = offsetPositions.get(0); +// assertThat(input.substring(url.start, url.end), is("https://faolex.fao.org/ docs/pdf/lao144178.pdf")); +// } + + @Test + public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval() throws Exception { + final String input = "1. 'internal status' indicates that their records should be \n" + + "hidden in the interface. \n" + + "2. In our previous work [1] we reported 77.03% F1-\n" + + "score. There is a slight decrease in absolute scores \n" + + "between DeLFT 0.2.8 and DeLFT 0.3.0. One cause \n" + + "may be the use of different hyperparameters in \n" + + "version 0.3.0 such as batch size and learning rate. \n" + + "However, the most probable cause could be the \n" + + "impact of using the Huggingface tokenizers \n" + + "library which is suffering from quality issues \n" + + "https://github.com/kermitt2/delft/issues/150. \n" + + "\n" + + "\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + //This is the actual text that is passed and is different from the layoutToken text. + final String inputReal = "1. 'internal status' indicates that their records should be hidden in the interface. 2. In our previous work [1] we reported 77.03% F1score. There is a slight decrease in absolute scores between DeLFT 0.2.8 and DeLFT 0.3.0. One cause may be the use of different hyperparameters in version 0.3.0 such as batch size and learning rate. However, the most probable cause could be the impact of using the Huggingface tokenizers library which is suffering from quality issues https://github.com/kermitt2/delft/issues/150. 
"; + + PDFAnnotation annotation = new PDFAnnotation(); + annotation.setPageNumber(10); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(10, 84.30, 706.68, 177.39, 9.52)); + annotation.setBoundingBoxes(boundingBoxes); + annotation.setDestination("https://github.com/kermitt2/delft/issues/150"); + annotation.setType(PDFAnnotation.Type.URI); + + List pdfAnnotations = List.of(annotation); + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url = offsetPositions.get(0); + assertThat(StringUtils.substring(input, url.start, url.end), is("https://github.com/kermitt2/delft/issues/150")); + } + + @Test + public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval2() throws Exception { + final String input = "This work is available at https://github.com/lfoppiano/ \n" + + "supercon2. The repository contains the code of the \n" + + "SuperCon 2 interface, the curation workflow, and the \n" + + "\n" + + "Table 2. Data support, the number of entities for each label in \n" + + "each of the datasets used for evaluating the ML models. The \n" + + "base dataset is the original dataset described in [18], and the \n" + + "curation dataset is automatically collected based on the data-\n" + + "base corrections by the interface and manually corrected. \n" + + "\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + LayoutToken lastTokenOfTheURL = tokenisedInput.get(19); + lastTokenOfTheURL.setPage(9); + lastTokenOfTheURL.setX(530.9363448275863); + lastTokenOfTheURL.setY(538.153); + lastTokenOfTheURL.setWidth(4.363655172413793); + lastTokenOfTheURL.setHeight(9.702); + + //This is the actual text that is passed and is different from the layoutToken text. + final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. "; + + PDFAnnotation annotation = new PDFAnnotation(); + annotation.setPageNumber(9); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(9, 408.76, 537.11, 126.54, 10.49)); + annotation.setBoundingBoxes(boundingBoxes); + annotation.setDestination("https://github.com/lfoppiano/supercon2"); + annotation.setType(PDFAnnotation.Type.URI); + + List pdfAnnotations = List.of(annotation); + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url = offsetPositions.get(0); + assertThat(input.substring(url.start, url.end), is("https://github.com/lfoppiano/ \nsupercon2")); + } + + @Test + public void testTokensPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval2() throws Exception { + final String input = "This work is available at https://github.com/lfoppiano/ \n" + + "supercon2. The repository contains the code of the \n" + + "SuperCon 2 interface, the curation workflow, and the \n" + + "\n" + + "Table 2. 
Data support, the number of entities for each label in \n" + + "each of the datasets used for evaluating the ML models. The \n" + + "base dataset is the original dataset described in [18], and the \n" + + "curation dataset is automatically collected based on the data-\n" + + "base corrections by the interface and manually corrected. \n" + + "\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + LayoutToken lastTokenOfTheURL = tokenisedInput.get(19); + lastTokenOfTheURL.setPage(9); + lastTokenOfTheURL.setX(530.9363448275863); + lastTokenOfTheURL.setY(538.153); + lastTokenOfTheURL.setWidth(4.363655172413793); + lastTokenOfTheURL.setHeight(9.702); + + //This is the actual text that is passed and is different from the layoutToken text. + final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. "; + + PDFAnnotation annotation = new PDFAnnotation(); + annotation.setPageNumber(9); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(9, 408.76, 537.11, 126.54, 10.49)); + annotation.setBoundingBoxes(boundingBoxes); + annotation.setDestination("https://github.com/lfoppiano/supercon2"); + annotation.setType(PDFAnnotation.Type.URI); + + List pdfAnnotations = List.of(annotation); + List offsetPositions = Lexicon.tokensPositionUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url = offsetPositions.get(0); + assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(url.start, url.end)), is("https://github.com/lfoppiano/ \nsupercon2")); + } + + @Test + public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectIntervalBasedOnText() throws Exception { + final String input = "This work is available at https://github.com/lfoppiano/ \n" + + "supercon2. The repository contains the code of the \n" + + "SuperCon 2 interface, the curation workflow, and the \n" + + "\n" + + "Table 2. Data support, the number of entities for each label in \n" + + "each of the datasets used for evaluating the ML models. The \n" + + "base dataset is the original dataset described in [18], and the \n" + + "curation dataset is automatically collected based on the data-\n" + + "base corrections by the interface and manually corrected. 
\n" + + "\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + LayoutToken lastTokenOfTheURL = tokenisedInput.get(19); + lastTokenOfTheURL.setPage(9); + lastTokenOfTheURL.setX(530.9363448275863); + lastTokenOfTheURL.setY(538.153); + lastTokenOfTheURL.setWidth(4.363655172413793); + lastTokenOfTheURL.setHeight(9.702); + + + PDFAnnotation annotation = new PDFAnnotation(); + annotation.setPageNumber(9); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(9, 408.76, 537.11, 126.54, 10.49)); + annotation.setBoundingBoxes(boundingBoxes); + annotation.setDestination("https://github.com/lfoppiano/supercon2"); + annotation.setType(PDFAnnotation.Type.URI); + List pdfAnnotations = List.of(annotation); + + //This is the actual text that is passed and is different from the layoutToken text. + final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. "; + + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations, inputReal); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url = offsetPositions.get(0); + assertThat(inputReal.substring(url.start, url.end), is("https://github.com/lfoppiano/ supercon2")); + } +} From 621d1da03a53402e59ec8601aa376ae0ce931db0 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 12 Apr 2024 10:10:21 +0900 Subject: [PATCH 08/14] fix consistency in method names --- .../src/main/java/org/grobid/core/lexicon/Lexicon.java | 4 ++-- .../src/test/java/org/grobid/core/lexicon/LexiconTest.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java index 247d161f5e..5bd5e642b9 100755 --- a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java +++ b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java @@ -1193,7 +1193,7 @@ public static List characterPositionsUrlPatternWithPdfAnnotation List pdfAnnotations, String text) { - List urlTokensPositions = tokensPositionUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations); + List urlTokensPositions = tokenPositionUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations); // here we need to match the offsetPositions related to the text obtained by the layoutTokens, with the text // which may be different (spaces, hypen, breakline) @@ -1203,7 +1203,7 @@ public static List characterPositionsUrlPatternWithPdfAnnotation /** * This method returns the token positions in respect of the layout tokens */ - public static List tokensPositionUrlPatternWithPdfAnnotations( + public static List tokenPositionUrlPatternWithPdfAnnotations( List layoutTokens, List pdfAnnotations) { diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java index f9d1e4d7c4..7aab70c58a 100644 --- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java +++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java @@ -163,7 +163,7 
@@ public void testTokensPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorr annotation.setType(PDFAnnotation.Type.URI); List pdfAnnotations = List.of(annotation); - List offsetPositions = Lexicon.tokensPositionUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); + List offsetPositions = Lexicon.tokenPositionUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); assertThat(offsetPositions, hasSize(1)); OffsetPosition url = offsetPositions.get(0); From 96073916bd5c486ecd28840a5dc8399182bbd156 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 12 Apr 2024 11:30:10 +0900 Subject: [PATCH 09/14] Update tests --- .../org/grobid/core/lexicon/LexiconTest.java | 81 +++++++++---------- 1 file changed, 39 insertions(+), 42 deletions(-) diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java index 7aab70c58a..abf407dbbf 100644 --- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java +++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java @@ -7,6 +7,7 @@ import org.grobid.core.layout.PDFAnnotation; import org.grobid.core.utilities.LayoutTokensUtil; import org.grobid.core.utilities.OffsetPosition; +import org.junit.Ignore; import org.junit.Test; import java.util.ArrayList; @@ -17,41 +18,44 @@ import static org.hamcrest.Matchers.hasSize; public class LexiconTest { -// @Test -// public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval() throws Exception { -// final String input = "This work was distributed on http:// github.com/ myUsername/ MyProject"; -// List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); -// -// List offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput); -// -// assertThat(offsetPositions, hasSize(1)); -// OffsetPosition FirstURL = offsetPositions.get(0); -// assertThat(input.substring(FirstURL.start, FirstURL.end), is("http:// github.com/ myUsername/ MyProject")); -// } -// -// @Test -// public void testCharacterPositionsUrlPattern_two_URL_shouldReturnCorrectInterval() throws Exception { -// final String input = "This work was distributed on http:// github.com/ myUsername/ MyProject. 
The data is available at https :// github.com/ superconductors/ hola.";
-// List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
-//
-// List offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput);
-//
-// assertThat(offsetPositions, hasSize(2));
-// OffsetPosition url = offsetPositions.get(1);
-// assertThat(input.substring(url.start, url.end), is("https :// github.com/ superconductors/ hola"));
-// }
-//
-// @Test
-// public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval_2() throws Exception {
-// final String input = "720 137409 The Government of Lao PDR 2005 Forestry Strategy to the year 2020 of the Lao PDR (available at: https://faolex.fao.org/ docs/pdf/lao144178.pdf)";
-// List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
-//
-// List offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput);
-//
-// assertThat(offsetPositions, hasSize(1));
-// OffsetPosition url = offsetPositions.get(0);
-// assertThat(input.substring(url.start, url.end), is("https://faolex.fao.org/ docs/pdf/lao144178.pdf"));
-// }
+ @Test
+ public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval() throws Exception {
+ final String input = "This work was distributed on http:// github.com/myUsername/MyProject";
+ List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+
+ List offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition FirstURL = offsetPositions.get(0);
+ assertThat(input.substring(FirstURL.start, FirstURL.end), is("http:// github.com/myUsername/MyProject"));
+ }
+
+ @Test
+ public void testTokenPositionsUrlPattern_URL_shouldReturnCorrectInterval() throws Exception {
+ final String input = "This work was distributed on http:// github.com/myUsername/MyProject";
+ List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+
+ List offsetPositions = Lexicon.tokenPositionsUrlPattern(tokenisedInput);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition FirstURL = offsetPositions.get(0);
+ //Note: The intervals returned by the method Utilities.convertStringOffsetToTokenOffset
+ // consider the upper index to be included, while Java considers the upper index to be excluded
+ assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(FirstURL.start, FirstURL.end + 1)), is("http:// github.com/myUsername/MyProject"));
+ }
+
+ @Test
+ @Ignore("This test will fail, it can be used to test a real case when updating the regular expression")
+ public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval_2() throws Exception {
+ final String input = "720 137409 The Government of Lao PDR 2005 Forestry Strategy to the year 2020 of the Lao PDR (available at: https://faolex.fao.org/ docs/pdf/lao144178.pdf)";
+ List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+
+ List offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition url = offsetPositions.get(0);
+ assertThat(input.substring(url.start, url.end), is("https://faolex.fao.org/ docs/pdf/lao144178.pdf"));
+ }

 @Test
 public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval() throws Exception {
@@ -111,9 +115,6 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC
 lastTokenOfTheURL.setWidth(4.363655172413793);
lastTokenOfTheURL.setHeight(9.702);

- //This is the actual text that is passed and is different from the layoutToken text.
- final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. ";
-
 PDFAnnotation annotation = new PDFAnnotation();
 annotation.setPageNumber(9);
 List boundingBoxes = new ArrayList<>();
@@ -151,9 +152,6 @@ public void testTokensPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorr
 lastTokenOfTheURL.setWidth(4.363655172413793);
 lastTokenOfTheURL.setHeight(9.702);

- //This is the actual text that is passed and is different from the layoutToken text.
- final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. ";
-
 PDFAnnotation annotation = new PDFAnnotation();
 annotation.setPageNumber(9);
 List boundingBoxes = new ArrayList<>();
@@ -191,7 +189,6 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC
 lastTokenOfTheURL.setWidth(4.363655172413793);
 lastTokenOfTheURL.setHeight(9.702);
-
 PDFAnnotation annotation = new PDFAnnotation();
 annotation.setPageNumber(9);
 List boundingBoxes = new ArrayList<>();

From 6ff15ee87db55c010b846e6a8a7120123534c7bf Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Wed, 17 Apr 2024 08:30:20 +0700
Subject: [PATCH 10/14] keep convention on the token/character calculation

---
 .../main/java/org/grobid/core/lexicon/Lexicon.java | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
index 5bd5e642b9..681a0da7a7 100755
--- a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
+++ b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
@@ -1195,24 +1195,24 @@ public static List characterPositionsUrlPatternWithPdfAnnotation

 List urlTokensPositions = tokenPositionUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations);

+ // We need to adjust the end of the positions to avoid problems with the sublist
+ // that is used in the following method
+ urlTokensPositions.stream().forEach(o -> o.end += 1);
+
 // here we need to match the offsetPositions related to the text obtained by the layoutTokens, with the text
 // which may be different (spaces, hyphens, line breaks)
 return TextUtilities.matchTokenAndString(layoutTokens, text, urlTokensPositions);
 }

 /**
- * This method returns the token positions in respect of the layout tokens
+ * This method returns the token positions with respect to the layout tokens,
+ * the output token offsets are (inclusive, inclusive)
 */
 public static List tokenPositionUrlPatternWithPdfAnnotations(
 List layoutTokens,
 List pdfAnnotations) {

- List offsetPositions = 
convertStringOffsetToTokenOffset(characterPositionsUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations), layoutTokens); - // We need to adjust the end of the positions to avoid problems with the sublist - - offsetPositions.stream().forEach(o -> o.end += 1); - - return offsetPositions; + return convertStringOffsetToTokenOffset(characterPositionsUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations), layoutTokens); } /** From 3900dc228b462dd951605ef31a7809fb18e6233c Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 28 Apr 2024 09:54:46 +0800 Subject: [PATCH 11/14] update test to follow the convention --- .../src/test/java/org/grobid/core/lexicon/LexiconTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java index abf407dbbf..c70c930435 100644 --- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java +++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java @@ -165,7 +165,8 @@ public void testTokensPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorr assertThat(offsetPositions, hasSize(1)); OffsetPosition url = offsetPositions.get(0); - assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(url.start, url.end)), is("https://github.com/lfoppiano/ \nsupercon2")); + // LF: we need a + 1 because the convention for the tokenPositionUrlPattern is inclusive, inclusive + assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(url.start, url.end + 1)), is("https://github.com/lfoppiano/ \nsupercon2")); } @Test From ec52f13948f854fc28c44f4a37f357c1ee9f44b2 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sat, 4 May 2024 12:50:26 +0900 Subject: [PATCH 12/14] get fixes on matchTokenAndString from PR #1099 --- .../grobid/core/utilities/TextUtilities.java | 38 +++-- .../core/utilities/TextUtilitiesTest.java | 136 +++++++++++++++++- 2 files changed, 160 insertions(+), 14 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java index 87224cace8..f0e6cf03af 100755 --- a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java @@ -1557,22 +1557,25 @@ public static org.apache.commons.lang3.tuple.Pair matchTokenAndString(List layoutTokens, String text, List urlPositions) { + public static List matchTokenAndString(List layoutTokens, String text, List positions) { List newPositions = new ArrayList<>(); StringBuilder accumulator = new StringBuilder(); int pos = 0; + int textPositionOfToken = 0; - for (OffsetPosition urlPosition : urlPositions) { - List urlTokens = layoutTokens.subList(urlPosition.start, urlPosition.end); + for (OffsetPosition position : positions) { + List annotationTokens = layoutTokens.subList(position.start, position.end); boolean first = true; - for (int i = 0; i < urlTokens.size(); i++) { - LayoutToken token = urlTokens.get(i); + accumulator = new StringBuilder(); + for (int i = 0; i < annotationTokens.size(); i++) { + LayoutToken token = annotationTokens.get(i); if (StringUtils.isEmpty(token.getText())) continue; - int newPos = text.indexOf(token.getText(), pos); - if (newPos != -1) { + textPositionOfToken = text.indexOf(token.getText(), pos); + if (textPositionOfToken != -1) { + //We update pos only at the first token of the annotation positions if (first) { - pos = newPos; + pos 
= textPositionOfToken; first = false; } accumulator.append(token); @@ -1581,16 +1584,25 @@ public static List matchTokenAndString(List layoutT continue; } if (StringUtils.isNotEmpty(accumulator)) { + int accumulatorTextLength = accumulator.toString().length(); int start = text.indexOf(accumulator.toString(), pos); - newPositions.add(new OffsetPosition(start, start + accumulator.toString().length())); - accumulator = new StringBuilder(); - pos = newPos; - first = true; + int end = start + accumulatorTextLength; + newPositions.add(new OffsetPosition(start, end)); + pos = end; break; } - pos = newPos; + pos = textPositionOfToken; } } + if (StringUtils.isNotEmpty(accumulator)) { + int annotationTextLength = accumulator.toString().length(); + int start = text.indexOf(accumulator.toString(), pos); + int end = start + annotationTextLength; + newPositions.add(new OffsetPosition(start, end)); + pos = end; + accumulator = new StringBuilder(); + } + } if (StringUtils.isNotEmpty(accumulator)) { int start = text.indexOf(accumulator.toString(), pos); diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java index 6303dc6450..8b53cc263e 100644 --- a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java +++ b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java @@ -13,7 +13,6 @@ import java.util.regex.Matcher; import static org.hamcrest.CoreMatchers.is; -import static org.hamcrest.CoreMatchers.startsWith; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.hasSize; import static org.junit.Assert.*; @@ -436,4 +435,139 @@ public void testMatchTokenAndString() throws Exception { assertThat(inputReal.substring(url1.start, url1.end), is("https://github.com/lfoppiano/ supercon2")); } + + + @Test + public void testMatchTokenAndString_twoElements() throws Exception { + final String input = "This work is available at https://github.com/lfoppiano/ \n" + + "supercon2. The repository contains the code of the \n" + + "SuperCon 2 interface, the curation workflow, and the \n" + + "\n" + + "Table 2. Data support, the number of entities for each label in \n" + + "each of the datasets used for evaluating the ML models. The \n" + + "base dataset is the original dataset described in [18], and the \n" + + "curation dataset is automatically collected based on the data-\n" + + "base corrections by the interface and manually corrected. \n" + + "\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. 
"; + List urlTokens = Arrays.asList(new OffsetPosition(0, 3), new OffsetPosition(10, 23)); + + List offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, inputReal, urlTokens); + + assertThat(offsetPositions, hasSize(2)); + OffsetPosition url0 = offsetPositions.get(0); + assertThat(url0.start, is(0)); + assertThat(url0.end, is(9)); + + assertThat(inputReal.substring(url0.start, url0.end), is("This work")); + + OffsetPosition url1 = offsetPositions.get(1); + assertThat(url1.start, is(26)); + assertThat(url1.end, is(65)); + + assertThat(inputReal.substring(url1.start, url1.end), is("https://github.com/lfoppiano/ supercon2")); + + } + + @Test + public void testMatchTokenAndString_twoElementsWithEqualValue() throws Exception { + final String input = "Christophe Castagne, Claudie Marec, Claudie Marec, Claudio Stalder,"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + List urlTokens = Arrays.asList( + new OffsetPosition(0, 3), + new OffsetPosition(5, 8), + new OffsetPosition(10, 13), + new OffsetPosition(15, 18) + ); + + List offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, input, urlTokens); + + assertThat(offsetPositions, hasSize(4)); + + OffsetPosition url0 = offsetPositions.get(0); + assertThat(url0.start, is(0)); + assertThat(url0.end, is(19)); + + assertThat(input.substring(url0.start, url0.end), is("Christophe Castagne")); + + OffsetPosition url1 = offsetPositions.get(1); + assertThat(url1.start, is(21)); + assertThat(url1.end, is(34)); + + assertThat(input.substring(url1.start, url1.end), is("Claudie Marec")); + + OffsetPosition url2 = offsetPositions.get(2); + assertThat(url2.start, is(36)); + assertThat(url2.end, is(49)); + + assertThat(input.substring(url2.start, url2.end), is("Claudie Marec")); + + OffsetPosition url3 = offsetPositions.get(3); + assertThat(url3.start, is(51)); + assertThat(url3.end, is(66)); + + assertThat(input.substring(url3.start, url3.end), is("Claudio Stalder")); + + } + + @Test + public void testMatchTokenAndString_twoElementsWithEqualValue2() throws Exception { + final String input = "We thank Felix Randow, Shigeki Higashiyama and Feng Zhang for plasmids.We thank Florian Steinberg for discussions and disclosure of unpublished results.We thank Matthew Freeman for helpful discussions.We express our deep gratitude to Moises Mallo for advice concerning CRISPR plus CRISPR reagents.We are grateful for the assistance of Ana Nóvoa and IGC's transgenics and mouse facilities.We thank IGC's cell sorting/flow cytometry, sequencing, and histopathology facilities."; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + List annotationTokenPositions = Arrays.asList( + new OffsetPosition(4, 7), + new OffsetPosition(9, 12), + new OffsetPosition(15, 18), + new OffsetPosition(27, 30), + new OffsetPosition(49, 52), + new OffsetPosition(71, 74), + new OffsetPosition(103, 106), + new OffsetPosition(109, 110), + new OffsetPosition(125, 126) + ); + + List offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, input, annotationTokenPositions); + + assertThat(offsetPositions, hasSize(9)); + + OffsetPosition url7 = offsetPositions.get(7); + assertThat(url7.start, is(349)); + assertThat(url7.end, is(352)); + + assertThat(input.substring(url7.start, url7.end), is("IGC")); + + OffsetPosition url8 = offsetPositions.get(8); + assertThat(url8.start, is(397)); + assertThat(url8.end, is(400)); + + assertThat(input.substring(url8.start, url8.end), is("IGC")); + + } + + @Test 
+ public void testMatchTokenAndString_twoElementsWithEqualValue3() throws Exception {
+ final String input = "We thank Benoit Demars for providing reaeration data and comments that signficantly improved the manuscript.This study was supported a NERC Case studentship awarded to DP, GYD and SJ, an ERC starting grant awarded to GYD, and the University of Exeter.";
+
+ List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+ List annotationTokenPositions = Arrays.asList(
+ new OffsetPosition(4, 7),
+ new OffsetPosition(40, 41),
+ new OffsetPosition(62, 63),
+ new OffsetPosition(79, 84)
+ );
+
+ List offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, input, annotationTokenPositions);
+
+ assertThat(offsetPositions, hasSize(4));
+
+ OffsetPosition url7 = offsetPositions.get(1);
+ assertThat(input.substring(url7.start, url7.end), is("NERC"));
+
+ OffsetPosition url8 = offsetPositions.get(2);
+ assertThat(input.substring(url8.start, url8.end), is("ERC"));
+ }
 }

From f983f2548813a02e9dc1a0b37e5fefe1eafd4abb Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Thu, 9 May 2024 12:17:59 +0900
Subject: [PATCH 13/14] Add additional test and fix to the method so that the
 offsets correctly match the real text (dehyphenised)

---
 .../grobid/core/document/TEIFormatter.java | 2 +-
 .../org/grobid/core/lexicon/LexiconTest.java | 36 +++++++++++++++++++
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index 20a7746388..7283a2e513 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1873,7 +1873,7 @@ public void segmentIntoSentences(Element curParagraph, List curPara
 }
 }

- List offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations);
+ List offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations, text);
 forbiddenPositions.addAll(offsetPositionsUrls);

 List theSentences =

diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
index c70c930435..8b3b501488 100644
--- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
+++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
@@ -208,4 +208,40 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC
 OffsetPosition url = offsetPositions.get(0);
 assertThat(inputReal.substring(url.start, url.end), is("https://github.com/lfoppiano/ supercon2"));
 }
+
+ @Test
+ public void testCharacterPositionsUrlPatternWithPDFAnnotations2_URL_shouldReturnCorrectIntervalBasedOnText() throws Exception {
+ final String input = "Table S1: Gene annotations from which exon-exon junctions were extracted and unioned to obtain \n" +
+ "a list of annotated junctions. All tracks were taken from the UCSC Genome Browser [10] except for \n" +
+ "GENCODE [2], which was downloaded from the GENCODE website http://www.gencodegenes. \n" +
+ "org/releases/. Junction coordinates from hg38 annotations were lifted over to hg19 before the \n" +
+ "union was performed. Of all gene annotations listed here, the Swedish Bioinformatics Institute \n" +
+ "(SIB) genes has the most, with over 400,000 junctions for each of hg19 and hg38. 
\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + LayoutToken lastTokenOfTheURL = tokenisedInput.get(97); + lastTokenOfTheURL.setPage(19); + lastTokenOfTheURL.setX(465.54675000000003); + lastTokenOfTheURL.setY(404.908); + lastTokenOfTheURL.setWidth(68.727); + lastTokenOfTheURL.setHeight(9.0873); + + PDFAnnotation annotation = new PDFAnnotation(); + annotation.setPageNumber(19); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(19, 401.551, 402.396, 139.445, 12.901999999999987)); + annotation.setBoundingBoxes(boundingBoxes); + annotation.setDestination("http://www.gencodegenes.org/releases/"); + annotation.setType(PDFAnnotation.Type.URI); + List pdfAnnotations = List.of(annotation); + + //This is the actual text that is passed and is different from the layoutToken text. + final String inputReal = "Table S1: Gene annotations from which exon-exon junctions were extracted and unioned to obtain a list of annotated junctions. All tracks were taken from the UCSC Genome Browser [10] except for GENCODE [2], which was downloaded from the GENCODE website http://www.gencodegenes. org/releases/. Junction coordinates from hg38 annotations were lifted over to hg19 before the union was performed. Of all gene annotations listed here, the Swedish Bioinformatics Institute (SIB) genes has the most, with over 400,000 junctions for each of hg19 and hg38. "; + + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations, inputReal); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url = offsetPositions.get(0); + assertThat(inputReal.substring(url.start, url.end), is("http://www.gencodegenes. org/releases/")); + } } From 617aa16a29ccd578c5734d42c1a92fdfce01b811 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 9 May 2024 17:11:07 +0900 Subject: [PATCH 14/14] Apply url preservation also in tables description and notes --- grobid-core/src/main/java/org/grobid/core/data/Table.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java index 6356978837..14d468418c 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Table.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Table.java @@ -141,7 +141,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form } if (desc != null && config.isWithSentenceSegmentation()) { - formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage()); + formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations()); // we need a sentence segmentation of the table caption, for that we need to introduce // a

, then a

@@ -215,7 +215,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form

 if (noteNode != null && config.isWithSentenceSegmentation()) {
 // we need a sentence segmentation of the figure caption
- formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage());
+ formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
 }

 // enclose note content in a <p> element
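
---

The series above revolves around a few recurring techniques; the sketches below illustrate them in isolation. They are illustrative sketches only, not GROBID code: the class names, literal values, and the simplified URL pattern are assumptions made for the examples.

Patches 01 and 09 rely on regex-based URL detection over the paragraph text, in the spirit of Lexicon.characterPositionsUrlPattern. Note that GROBID's real pattern also has to tolerate the spaces that PDF extraction injects into URLs (as in the "http:// github.com/..." test fixtures), which this deliberately simplified stand-in does not:

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class UrlSpanSketch {
    // Deliberately simplified stand-in for GROBID's URL regex.
    private static final Pattern SIMPLE_URL = Pattern.compile("https?://\\S+[\\w/]");

    // Returns character spans (start inclusive, end exclusive) of every URL match.
    static List<int[]> findUrlSpans(String text) {
        List<int[]> spans = new ArrayList<>();
        Matcher m = SIMPLE_URL.matcher(text);
        while (m.find()) {
            spans.add(new int[]{m.start(), m.end()});
        }
        return spans;
    }

    public static void main(String[] args) {
        String text = "Code at https://github.com/kermitt2/delft/issues/150. More text.";
        for (int[] s : findUrlSpans(text)) {
            System.out.println(text.substring(s[0], s[1])); // trailing sentence dot excluded
        }
    }
}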
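Patches 09-11 hinge on two offset conventions that are easy to mix up: character offsets follow Java's substring convention (start inclusive, end exclusive), while the token offsets returned by tokenPositionUrlPatternWithPdfAnnotations are inclusive on both ends, so callers need end + 1 when slicing with subList. A minimal sketch, using plain strings as hypothetical stand-ins for LayoutToken:

import java.util.Arrays;
import java.util.List;

public class OffsetConventionSketch {
    public static void main(String[] args) {
        String text = "see http://example.org now";
        // Character offsets: start inclusive, end exclusive, as in String.substring.
        System.out.println(text.substring(4, 22)); // -> "http://example.org"

        // Token offsets under the (inclusive, inclusive) convention: the upper
        // bound is part of the span, hence the + 1 when calling subList.
        List<String> tokens = Arrays.asList("see", " ", "http", "://", "example", ".", "org", " ", "now");
        int tokStart = 2, tokEnd = 6; // both bounds included by convention
        System.out.println(String.join("", tokens.subList(tokStart, tokEnd + 1))); // -> "http://example.org"
    }
}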
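Patches 12 and 13 deal with re-anchoring token-level annotations onto a "real" text that differs from the concatenated tokens (dehyphenised, line breaks collapsed). A hedged sketch of the idea behind TextUtilities.matchTokenAndString: walk the annotation's tokens, locate each one in the target text, and grow a character interval. Names and structure are simplified; only the matching strategy is taken from the patch:

import java.util.List;

public class TokenToTextAlignmentSketch {
    // Returns {start, end} character offsets of the token sequence within text,
    // skipping pure-whitespace tokens that may not survive normalisation.
    static int[] align(List<String> annotationTokens, String text, int searchFrom) {
        int pos = searchFrom;
        int start = -1;
        for (String token : annotationTokens) {
            if (token.isBlank()) continue;   // spacing tokens need not survive normalisation
            int found = text.indexOf(token, pos);
            if (found < 0) break;            // token was rewritten (e.g. dehyphenised away)
            if (start < 0) start = found;    // the first matched token anchors the interval
            pos = found + token.length();
        }
        return new int[]{start, pos};
    }

    public static void main(String[] args) {
        // The tokenised URL spans a line break; the target text is the repaired flow.
        List<String> tokens = List.of("http", "://", "www", ".", "gencodegenes", ".", " ", "org", "/", "releases", "/");
        String real = "downloaded from the GENCODE website http://www.gencodegenes. org/releases/. Junctions follow.";
        int[] span = align(tokens, real, 0);
        System.out.println(real.substring(span[0], span[1])); // -> "http://www.gencodegenes. org/releases/"
    }
}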
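Finally, patches 01, 13 and 14 all feed the detected URL spans into the sentence segmenter as forbidden positions, so that a dot inside "gencodegenes. org" cannot be mistaken for a sentence boundary. A simplified stand-in for what SentenceUtilities.runSentenceDetection does with those spans; the naive full-stop boundary detector and the record syntax (Java 16+) are assumptions of the sketch:

import java.util.ArrayList;
import java.util.List;

public class ForbiddenSpanSketch {
    record Span(int start, int end) {} // character interval, end exclusive

    static boolean insideForbidden(int boundary, List<Span> forbidden) {
        for (Span s : forbidden) {
            if (boundary > s.start() && boundary < s.end()) return true;
        }
        return false;
    }

    public static void main(String[] args) {
        String text = "See http://www.gencodegenes. org/releases/. More text follows.";
        List<Span> urls = List.of(new Span(4, 42)); // span of the URL, inner dots included
        List<Integer> boundaries = new ArrayList<>();
        // Naive candidate boundaries: every full stop outside a forbidden span.
        for (int i = 0; i < text.length(); i++) {
            if (text.charAt(i) == '.' && !insideForbidden(i, urls)) {
                boundaries.add(i);
            }
        }
        System.out.println(boundaries); // only the two sentence-final dots survive
    }
}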