diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java
index 6356978837..14d468418c 100644
--- a/grobid-core/src/main/java/org/grobid/core/data/Table.java
+++ b/grobid-core/src/main/java/org/grobid/core/data/Table.java
@@ -141,7 +141,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}
if (desc != null && config.isWithSentenceSegmentation()) {
- formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage());
+ formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
// we need a sentence segmentation of the table caption, for that we need to introduce
// a <div>, then a <p>
@@ -215,7 +215,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
if (noteNode != null && config.isWithSentenceSegmentation()) {
// we need a sentence segmentation of the figure caption
- formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage());
+ formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
}
// enclose note content in a <p> element
diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index 7359577bcf..c50ee73c69 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -28,6 +28,7 @@
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.lang.Language;
import org.grobid.core.layout.*;
+import org.grobid.core.lexicon.Lexicon;
import org.grobid.core.utilities.SentenceUtilities;
import org.grobid.core.tokenization.TaggingTokenCluster;
import org.grobid.core.tokenization.TaggingTokenClusteror;
@@ -1320,7 +1321,7 @@ private StringBuilder toTEINote(StringBuilder tei,
if (config.isWithSentenceSegmentation()) {
- segmentIntoSentences(pNote, noteTokens, config, doc.getLanguage());
+ segmentIntoSentences(pNote, noteTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
}
desc.appendChild(pNote);
@@ -1522,7 +1523,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(clusterTokens);
if (isNewParagraph(lastClusterLabel, curParagraph)) {
if (curParagraph != null && config.isWithSentenceSegmentation()) {
- segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage());
+ segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
}
curParagraph = teiElement("p");
if (config.isGenerateTeiIds()) {
@@ -1550,7 +1551,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
} else {
if (isNewParagraph(lastClusterLabel, curParagraph)) {
if (curParagraph != null && config.isWithSentenceSegmentation()) {
- segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage());
+ segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
}
curParagraph = teiElement("p");
if (config.isGenerateTeiIds()) {
@@ -1769,7 +1770,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
// in case we segment paragraph into sentences, we still need to do it for the last paragraph
if (curParagraph != null && config.isWithSentenceSegmentation()) {
- segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage());
+ segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
}
// remove possibly empty div in the div list
@@ -1836,6 +1837,10 @@ public static boolean isNewParagraph(TaggingLabel lastClusterLabel, Element curP
}
public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curParagraphTokens, GrobidAnalysisConfig config, String lang) {
+ segmentIntoSentences(curParagraph, curParagraphTokens, config, lang, new ArrayList<>());
+ }
+
+ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curParagraphTokens, GrobidAnalysisConfig config, String lang, List<PDFAnnotation> annotations) {
// in order to avoid having a sentence boundary in the middle of a ref element
// (which is frequent given the abbreviation in the reference expression, e.g. Fig.)
// we only consider for sentence segmentation the text under <p> and skip the text under <ref>.
@@ -1844,7 +1849,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
// in xom, the following gives all the text under the element, for the whole subtree
String text = curParagraph.getValue();
- if (text == null || text.length() == 0)
+ if (StringUtils.isEmpty(text))
return;
// identify ref nodes, ref spans and ref positions
@@ -1861,8 +1866,8 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
// for readability in another conditional
if (((Element) theNode).getLocalName().equals("ref")) {
// map character offset of the node
- mapRefNodes.put(Integer.valueOf(pos), theNode);
- refPositions.add(Integer.valueOf(pos));
+ mapRefNodes.put(pos, theNode);
+ refPositions.add(pos);
String chunk = theNode.getValue();
forbiddenPositions.add(new OffsetPosition(pos, pos+chunk.length()));
@@ -1871,6 +1876,9 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
}
}
+ List<OffsetPosition> offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations, text);
+ forbiddenPositions.addAll(offsetPositionsUrls);
+
List<OffsetPosition> theSentences =
SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang));
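For context on the two added lines above: every span in forbiddenPositions is treated as atomic by the sentence segmenter, so a URL with internal dots can no longer produce a spurious sentence boundary. A minimal sketch of the behavior, with illustrative offsets rather than values from the patch (assumes the GROBID core imports used elsewhere in this diff):

    // Illustrative sketch: spans in forbiddenPositions never contain a boundary.
    String sample = "See https://github.com/kermitt2/delft/issues/150 for details. Next sentence.";
    List<LayoutToken> sampleTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(sample);
    List<OffsetPosition> forbidden = new ArrayList<>();
    forbidden.add(new OffsetPosition(4, 48)); // the URL span, end exclusive
    List<OffsetPosition> sentenceSpans = SentenceUtilities.getInstance()
            .runSentenceDetection(sample, forbidden, sampleTokens, new Language("en"));
    // two sentences are expected; no boundary can fall inside offsets 4..48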
@@ -1893,7 +1901,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curParagraphTokens, GrobidAnalysisConfig config, String lang, List<PDFAnnotation> annotations)
for(int i=0; i<theSentences.size(); i++) {
diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java
--- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java
+++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java
@@ ... @@ public Pair<String, LayoutTokenization> processShort(List<LayoutToken> tokens, Document doc) {
List<LayoutToken> currentChunk = new ArrayList<>();
int currentPos = 0;
for(LayoutToken token : tokens) {
- if (currentChunk.size() != 0) {
+ if (CollectionUtils.isNotEmpty(currentChunk)) {
int tokenPos = token.getOffset();
if (currentPos != tokenPos) {
// new chunk
@@ -508,7 +509,7 @@ public Pair<String, LayoutTokenization> processShort(List<LayoutToken> tokens, Do
LayoutTokenization layouts = featSeg.getRight();
if (layouts != null)
layoutTokenization = layouts.getTokenization();
- if ( (featuredText != null) && (featuredText.trim().length() > 0) ) {
+ if (StringUtils.isNotBlank(featuredText)) {
res = label(featuredText);
res = postProcessFullTextLabeledText(res);
}
diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java
index c92b270ff1..1068be3e28 100644
--- a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java
+++ b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java
@@ -102,7 +102,7 @@ public MutablePair<Element, MutableTriple<List<Funding>, List<Person>, List<Affiliation>>>
diff --git a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
--- a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
+++ b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
@@ -563,7 +565,7 @@ public void initResearchInfrastructures() {
String[] pieces = line.split(";", -1); // -1 for getting empty tokens too
if (pieces.length == 3) {
if (pieces[0].length() > 0) {
-
+
if (pieces[1].length() > 0) {
OrganizationRecord localInfra = new OrganizationRecord(pieces[0], pieces[1], "en");
List<OrganizationRecord> localInfraList = researchOrganizations.get(pieces[0].toLowerCase());
@@ -608,7 +610,7 @@ public void initResearchInfrastructures() {
throw new GrobidResourceException("Error when compiling lexicon matcher for research infrastructure.", e);
} catch (Exception e) {
throw new GrobidException("An exception occured while running Grobid Lexicon init.", e);
- }
+ }
}
/**
@@ -642,15 +644,15 @@ public boolean isPunctuation(String s) {
public List<OrganizationRecord> getOrganizationNamingInfo(String name) {
if (researchOrganizations == null)
return null;
- return researchOrganizations.get(name.toLowerCase());
+ return researchOrganizations.get(name.toLowerCase());
}
/**
* Map the language codes used by the language identifier component to the normal
* language name.
*
- * Note: due to an older bug, kr is currently map to Korean too - this should
- * disappear at some point in the future after retraining of models
+ * Note: due to an older bug, kr is currently mapped to Korean too - this should
+ * disappear at some point in the future after retraining of the models
*
* @param code the language to be mapped
*/
@@ -896,7 +898,7 @@ public List<OffsetPosition> charPositionsOrganisationNames(String s) {
/**
* Soft look-up in organisation names gazetteer for a tokenize sequence.
- * It return a list of positions referring to the character positions within the input
+ * It returns a list of positions referring to the character positions within the input
* sequence.
*
* @param s the input list of LayoutToken
@@ -987,7 +989,7 @@ public List<OffsetPosition> tokenPositionsLocationNames(List<LayoutToken> s) {
}
/**
- * Soft look-up in location name gazetteer for a string, return a list of positions referring
+ * Soft look-up in location name gazetteer for a string, returning a list of positions referring
* to the character positions within the string.
*
* For example "The car is in Milan" as Milan is a location, would return OffsetPosition(14,19)
@@ -1004,7 +1006,7 @@ public List<OffsetPosition> charPositionsLocationNames(String s) {
}
/**
- * Soft look-up in location name gazetteer for a list of LayoutToken, return a list of
+ * Soft look-up in location name gazetteer for a list of LayoutToken, returning a list of
* positions referring to the character positions in the input sequence.
*
* For example "The car is in Milan" as Milan is a location, would return OffsetPosition(14,19)
@@ -1092,7 +1094,7 @@ public List<OffsetPosition> charPositionsPersonTitle(List<LayoutToken> s) {
public List<OffsetPosition> tokenPositionsIdentifierPattern(List<LayoutToken> tokens) {
List<OffsetPosition> result = new ArrayList<>();
String text = LayoutTokensUtil.toText(tokens);
-
+
// DOI positions
result = tokenPositionsDOIPattern(tokens, text);
@@ -1115,10 +1117,10 @@ public List<OffsetPosition> tokenPositionsIdentifierPattern(List<LayoutToken> to
public List<OffsetPosition> tokenPositionsDOIPattern(List<LayoutToken> tokens, String text) {
List<OffsetPosition> textResult = new ArrayList<>();
Matcher doiMatcher = TextUtilities.DOIPattern.matcher(text);
- while (doiMatcher.find()) {
+ while (doiMatcher.find()) {
textResult.add(new OffsetPosition(doiMatcher.start(), doiMatcher.end()));
}
- return Utilities.convertStringOffsetToTokenOffset(textResult, tokens);
+ return convertStringOffsetToTokenOffset(textResult, tokens);
}
/**
@@ -1128,11 +1130,11 @@ public List<OffsetPosition> tokenPositionsDOIPattern(List<LayoutToken> tokens, S
public List<OffsetPosition> tokenPositionsArXivPattern(List<LayoutToken> tokens, String text) {
List<OffsetPosition> textResult = new ArrayList<>();
Matcher arXivMatcher = TextUtilities.arXivPattern.matcher(text);
- while (arXivMatcher.find()) {
+ while (arXivMatcher.find()) {
//System.out.println(arXivMatcher.start() + " / " + arXivMatcher.end() + " / " + text.substring(arXivMatcher.start(), arXivMatcher.end()));
textResult.add(new OffsetPosition(arXivMatcher.start(), arXivMatcher.end()));
}
- return Utilities.convertStringOffsetToTokenOffset(textResult, tokens);
+ return convertStringOffsetToTokenOffset(textResult, tokens);
}
@@ -1141,7 +1143,7 @@ public List<OffsetPosition> tokenPositionsArXivPattern(List<LayoutToken> tokens,
*/
public List<OffsetPosition> tokenPositionsISSNPattern(List<LayoutToken> tokens) {
List<OffsetPosition> result = new ArrayList<>();
-
+
// TBD !
return result;
@@ -1161,50 +1163,71 @@ public List<OffsetPosition> tokenPositionsISBNPattern(List<LayoutToken> tokens)
/**
* Identify in tokenized input the positions of an URL pattern with token positions
*/
- public List<OffsetPosition> tokenPositionsUrlPattern(List<LayoutToken> tokens) {
- //List<OffsetPosition> result = new ArrayList<>();
- String text = LayoutTokensUtil.toText(tokens);
- List<OffsetPosition> textResult = new ArrayList<>();
- Matcher urlMatcher = TextUtilities.urlPattern.matcher(text);
- while (urlMatcher.find()) {
- //System.out.println(urlMatcher.start() + " / " + urlMatcher.end() + " / " + text.substring(urlMatcher.start(), urlMatcher.end()));
- textResult.add(new OffsetPosition(urlMatcher.start(), urlMatcher.end()));
- }
- return Utilities.convertStringOffsetToTokenOffset(textResult, tokens);
+ public static List<OffsetPosition> tokenPositionsUrlPattern(List<LayoutToken> tokens) {
+ List<OffsetPosition> textResult = characterPositionsUrlPattern(tokens);
+ return convertStringOffsetToTokenOffset(textResult, tokens);
}
/**
* Identify in tokenized input the positions of an URL pattern with character positions
*/
- public List<OffsetPosition> characterPositionsUrlPattern(List<LayoutToken> tokens) {
- //List<OffsetPosition> result = new ArrayList<>();
+ public static List<OffsetPosition> characterPositionsUrlPattern(List<LayoutToken> tokens) {
String text = LayoutTokensUtil.toText(tokens);
List<OffsetPosition> textResult = new ArrayList<>();
Matcher urlMatcher = TextUtilities.urlPattern.matcher(text);
- while (urlMatcher.find()) {
+ while (urlMatcher.find()) {
textResult.add(new OffsetPosition(urlMatcher.start(), urlMatcher.end()));
}
return textResult;
}
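A note on conventions that the refactor above preserves: characterPositionsUrlPattern returns character offsets over the text rebuilt from the tokens, end exclusive, while tokenPositionsUrlPattern returns token indices with an inclusive upper bound. A usage sketch mirroring the new unit tests at the bottom of this diff (assumes the imports used in those tests):

    // Sketch of the two offset conventions (see LexiconTest below for real cases).
    List<LayoutToken> tokens = GrobidAnalyzer.getInstance()
            .tokenizeWithLayoutToken("This work was distributed on http:// github.com/myUsername/MyProject");
    String tokenText = LayoutTokensUtil.toText(tokens);

    OffsetPosition charSpan = Lexicon.characterPositionsUrlPattern(tokens).get(0);
    String byChars = tokenText.substring(charSpan.start, charSpan.end); // end exclusive

    OffsetPosition tokenSpan = Lexicon.tokenPositionsUrlPattern(tokens).get(0);
    String byTokens = LayoutTokensUtil.toText(tokens.subList(tokenSpan.start, tokenSpan.end + 1)); // end inclusive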
/**
- * Identify in tokenized input the positions of an URL pattern with character positions,
+ * Identify in tokenized input the positions of a URL pattern with character positions,
* and refine positions based on possible PDF URI annotations.
- *
+ *
* This will produce better quality recognized URL, avoiding missing suffixes and problems
* with break lines and spaces.
**/
public static List<OffsetPosition> characterPositionsUrlPatternWithPdfAnnotations(
- List<LayoutToken> layoutTokens,
- List<PDFAnnotation> pdfAnnotations,
- String text) {
- List<OffsetPosition> urlPositions = Lexicon.getInstance().characterPositionsUrlPattern(layoutTokens);
+ List<LayoutToken> layoutTokens,
+ List<PDFAnnotation> pdfAnnotations,
+ String text) {
+
+ List<OffsetPosition> urlTokensPositions = tokenPositionUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations);
+
+ // We need to adjust the end of the positions to avoid problems with the sublist
+ // that is used in the following method
+ urlTokensPositions.stream().forEach(o -> o.end += 1);
+
+ // here we need to match the offsetPositions related to the text obtained from the layoutTokens with the text
+ // parameter, which may differ (spaces, hyphenation, line breaks)
+ return TextUtilities.matchTokenAndString(layoutTokens, text, urlTokensPositions);
+ }
+
+ /**
+ * This method returns the token positions with respect to the layout tokens;
+ * the output token offsets are (inclusive, inclusive)
+ */
+ public static List<OffsetPosition> tokenPositionUrlPatternWithPdfAnnotations(
+ List<LayoutToken> layoutTokens,
+ List<PDFAnnotation> pdfAnnotations) {
+
+ return convertStringOffsetToTokenOffset(characterPositionsUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations), layoutTokens);
+ }
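The (inclusive, inclusive) convention is why the text-aligned variant above bumps each end by one before calling matchTokenAndString: subList is end exclusive. A small sketch of the invariant, assuming the toy URL below is matched by TextUtilities.urlPattern:

    // Illustrative: inclusive token offsets must be widened by one for subList.
    List<LayoutToken> toyTokens = GrobidAnalyzer.getInstance()
            .tokenizeWithLayoutToken("see http://example.org now");
    List<PDFAnnotation> noAnnotations = new ArrayList<>(); // no PDF link annotations here
    OffsetPosition inclusiveSpan =
            Lexicon.tokenPositionUrlPatternWithPdfAnnotations(toyTokens, noAnnotations).get(0);
    String url = LayoutTokensUtil.toText(toyTokens.subList(inclusiveSpan.start, inclusiveSpan.end + 1));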
+
+ /**
+ * This method returns the character offsets relative to the string obtained from the layout tokens.
+ * Notice the absence of the String text parameter.
+ */
+ public static List<OffsetPosition> characterPositionsUrlPatternWithPdfAnnotations(
+ List<LayoutToken> layoutTokens,
+ List<PDFAnnotation> pdfAnnotations) {
+ List<OffsetPosition> urlPositions = Lexicon.characterPositionsUrlPattern(layoutTokens);
List resultPositions = new ArrayList<>();
// do we need to extend the url position based on additional position of the corresponding
// PDF annotation?
for(OffsetPosition urlPosition : urlPositions) {
-
int startPos = urlPosition.start;
int endPos = urlPosition.end;
@@ -1230,11 +1253,10 @@ public static List<OffsetPosition> characterPositionsUrlPatternWithPdfAnnotation
tokenIndex++;
}
- //String urlString = LayoutTokensUtil.toText(urlTokens);
- String urlString = text.substring(startPos, endPos);
+ String urlString = LayoutTokensUtil.toText(urlTokens);
PDFAnnotation targetAnnotation = null;
- if (urlTokens.size()>0) {
+ if (CollectionUtils.isNotEmpty(urlTokens)) {
LayoutToken lastToken = urlTokens.get(urlTokens.size()-1);
if (pdfAnnotations != null) {
for (PDFAnnotation pdfAnnotation : pdfAnnotations) {
@@ -1253,7 +1275,7 @@ public static List<OffsetPosition> characterPositionsUrlPatternWithPdfAnnotation
String destination = targetAnnotation.getDestination();
int destinationPos = 0;
- if (destination.indexOf(urlString) != -1) {
+ if (destination.contains(urlString)) {
destinationPos = destination.indexOf(urlString)+urlString.length();
}
@@ -1261,7 +1283,7 @@ public static List<OffsetPosition> characterPositionsUrlPatternWithPdfAnnotation
for(int j=endTokensIndex+1; j<layoutTokens.size(); j++) {
LayoutToken nextToken = layoutTokens.get(j);
int pos = destination.indexOf(nextToken.getText(), destinationPos);
if (pos != -1) {
endPos += nextToken.getText().length();
destinationPos = pos + nextToken.getText().length();
urlTokens.add(nextToken);
- } else
+ } else
break;
}
}
}
// finally avoid ending a URL by a dot, because it can harm the sentence segmentation
- if (text.charAt(endPos-1) == '.')
- endPos = endPos-1;
+ if (StringUtils.substring(LayoutTokensUtil.toText(layoutTokens), startPos, endPos).endsWith(".")) {
+ endPos = endPos - 1;
+ }
OffsetPosition position = new OffsetPosition();
position.start = startPos;
@@ -1303,11 +1326,11 @@ public List<OffsetPosition> tokenPositionsEmailPattern(List<LayoutToken> tokens)
return new ArrayList<>();
List<OffsetPosition> textResult = new ArrayList<>();
Matcher emailMatcher = TextUtilities.emailPattern.matcher(text);
- while (emailMatcher.find()) {
+ while (emailMatcher.find()) {
//System.out.println(urlMatcher.start() + " / " + urlMatcher.end() + " / " + text.substring(urlMatcher.start(), urlMatcher.end()));
textResult.add(new OffsetPosition(emailMatcher.start(), emailMatcher.end()));
}
- return Utilities.convertStringOffsetToTokenOffset(textResult, tokens);
+ return convertStringOffsetToTokenOffset(textResult, tokens);
}
}
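Taken together, the new static entry points form a small pipeline: regex matching over the token text, extension of each match through overlapping PDF URI annotations, then optional re-alignment to a caller-supplied string. A hedged summary sketch, where the sample text and the empty annotation list are illustrative:

    // The three entry points, lowest to highest level.
    List<LayoutToken> tokens = GrobidAnalyzer.getInstance()
            .tokenizeWithLayoutToken("code at https://github.com/kermitt2/grobid here");
    List<PDFAnnotation> annotations = new ArrayList<>(); // none in this toy example

    // 1) character offsets over the token text, possibly extended by annotations
    List<OffsetPosition> charOnTokenText =
            Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokens, annotations);

    // 2) the same spans as (inclusive, inclusive) token indices
    List<OffsetPosition> tokenSpans =
            Lexicon.tokenPositionUrlPatternWithPdfAnnotations(tokens, annotations);

    // 3) character offsets re-aligned to an external string with different spacing
    String externalText = "code at https://github.com/kermitt2/grobid here";
    List<OffsetPosition> charOnExternal =
            Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokens, annotations, externalText);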
diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java
index 06f69bcdee..f0e6cf03af 100755
--- a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java
+++ b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java
@@ -1556,4 +1556,59 @@ public static org.apache.commons.lang3.tuple.Pair
+
+ public static List<OffsetPosition> matchTokenAndString(List<LayoutToken> layoutTokens, String text, List<OffsetPosition> positions) {
+ List<OffsetPosition> newPositions = new ArrayList<>();
+ StringBuilder accumulator = new StringBuilder();
+ int pos = 0;
+ int textPositionOfToken = 0;
+
+ for (OffsetPosition position : positions) {
+ List<LayoutToken> annotationTokens = layoutTokens.subList(position.start, position.end);
+ boolean first = true;
+ accumulator = new StringBuilder();
+ for (int i = 0; i < annotationTokens.size(); i++) {
+ LayoutToken token = annotationTokens.get(i);
+ if (StringUtils.isEmpty(token.getText()))
+ continue;
+ textPositionOfToken = text.indexOf(token.getText(), pos);
+ if (textPositionOfToken != -1) {
+ //We update pos only at the first token of the annotation positions
+ if (first) {
+ pos = textPositionOfToken;
+ first = false;
+ }
+ accumulator.append(token);
+ } else {
+ if (SentenceUtilities.toSkipToken(token.getText())) {
+ continue;
+ }
+ if (StringUtils.isNotEmpty(accumulator)) {
+ int accumulatorTextLength = accumulator.toString().length();
+ int start = text.indexOf(accumulator.toString(), pos);
+ int end = start + accumulatorTextLength;
+ newPositions.add(new OffsetPosition(start, end));
+ pos = end;
+ break;
+ }
+ pos = textPositionOfToken;
+ }
+ }
+ if (StringUtils.isNotEmpty(accumulator)) {
+ int annotationTextLength = accumulator.toString().length();
+ int start = text.indexOf(accumulator.toString(), pos);
+ int end = start + annotationTextLength;
+ newPositions.add(new OffsetPosition(start, end));
+ pos = end;
+ accumulator = new StringBuilder();
+ }
+
+ }
+ if (StringUtils.isNotEmpty(accumulator)) {
+ int start = text.indexOf(accumulator.toString(), pos);
+ newPositions.add(new OffsetPosition(start, start + accumulator.toString().length()));
+ }
+
+ return newPositions;
+ }
}
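On the method added above: the token stream can contain newline or hyphenation tokens that the caller's string replaces with plain spaces, so token-derived spans cannot be mapped by offset arithmetic alone; matchTokenAndString re-anchors each span by searching the accumulated token text inside the target string. A usage sketch with illustrative token indices (exact values are in TextUtilitiesTest below):

    // Illustrative: token span -> character span on a differently-spaced string.
    List<LayoutToken> tokens = GrobidAnalyzer.getInstance()
            .tokenizeWithLayoutToken("available at https://github.com/lfoppiano/ \nsupercon2.");
    String callerText = "available at https://github.com/lfoppiano/ supercon2.";
    // token indices are illustrative; ends are exclusive, as expected by subList
    List<OffsetPosition> tokenSpans = Arrays.asList(new OffsetPosition(4, 18));
    List<OffsetPosition> charSpans = TextUtilities.matchTokenAndString(tokens, callerText, tokenSpans);
    // each returned span reads back with callerText.substring(span.start, span.end)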
diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/Utilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/Utilities.java
index de88ab3aa4..d4a838a5df 100755
--- a/grobid-core/src/main/java/org/grobid/core/utilities/Utilities.java
+++ b/grobid-core/src/main/java/org/grobid/core/utilities/Utilities.java
@@ -504,7 +504,7 @@ public static List<OffsetPosition> convertStringOffsetToTokenOffsetOld(
}
/**
- * This version uses actual LayoutToken offsets relative to the tokens present in argment only.
+ * This version uses actual LayoutToken offsets relative to the tokens present in argument only.
* It supposes that the stringPosition have been identified on the provided tokens only, and not
* restricted to the complete document.
*/
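Since the javadoc fix above touches convertStringOffsetToTokenOffset, its contract in short: character spans in, token-index spans out, with the returned upper bound inclusive. A sketch matching the new UtilitiesTest at the end of this diff:

    // "This is a token." with character span (5, 9) covers "is a";
    // the returned token offsets are inclusive on both ends.
    List<LayoutToken> toks = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken("This is a token.");
    List<OffsetPosition> spans = Utilities.convertStringOffsetToTokenOffset(
            Arrays.asList(new OffsetPosition(5, 9)), toks);
    String covered = LayoutTokensUtil.toText(toks.subList(spans.get(0).start, spans.get(0).end + 1)); // "is a"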
diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java
index 620f01a73a..2d888520ec 100755
--- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java
+++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java
@@ -1,13 +1,16 @@
package org.grobid.core.lexicon;
+import org.apache.commons.lang3.StringUtils;
import org.grobid.core.analyzers.GrobidAnalyzer;
+import org.grobid.core.layout.BoundingBox;
+import org.grobid.core.layout.PDFAnnotation;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.grobid.core.layout.LayoutToken;
-import org.junit.AfterClass;
import org.junit.Before;
import org.junit.Test;
+import java.util.ArrayList;
import java.util.List;
import static org.hamcrest.CoreMatchers.is;
diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
new file mode 100644
index 0000000000..8b3b501488
--- /dev/null
+++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
@@ -0,0 +1,247 @@
+package org.grobid.core.lexicon;
+
+import org.apache.commons.lang3.StringUtils;
+import org.grobid.core.analyzers.GrobidAnalyzer;
+import org.grobid.core.layout.BoundingBox;
+import org.grobid.core.layout.LayoutToken;
+import org.grobid.core.layout.PDFAnnotation;
+import org.grobid.core.utilities.LayoutTokensUtil;
+import org.grobid.core.utilities.OffsetPosition;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.hamcrest.CoreMatchers.is;
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.hasSize;
+
+public class LexiconTest {
+ @Test
+ public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval() throws Exception {
+ final String input = "This work was distributed on http:// github.com/myUsername/MyProject";
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+
+ List<OffsetPosition> offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition firstURL = offsetPositions.get(0);
+ assertThat(input.substring(firstURL.start, firstURL.end), is("http:// github.com/myUsername/MyProject"));
+ }
+
+ @Test
+ public void testTokenPositionsUrlPattern_URL_shouldReturnCorrectInterval() throws Exception {
+ final String input = "This work was distributed on http:// github.com/myUsername/MyProject";
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+
+ List<OffsetPosition> offsetPositions = Lexicon.tokenPositionsUrlPattern(tokenisedInput);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition firstURL = offsetPositions.get(0);
+ //Note: the intervals returned by the method Utilities.convertStringOffsetToTokenOffset
+ // consider the upper index to be included, while Java considers the upper index to be excluded
+ assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(firstURL.start, firstURL.end + 1)), is("http:// github.com/myUsername/MyProject"));
+ }
+
+ @Test
+ @Ignore("This test will fail, it can be used to test a real case when updating the regular expression")
+ public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval_2() throws Exception {
+ final String input = "720 137409 The Government of Lao PDR 2005 Forestry Strategy to the year 2020 of the Lao PDR (available at: https://faolex.fao.org/ docs/pdf/lao144178.pdf)";
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+
+ List<OffsetPosition> offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition url = offsetPositions.get(0);
+ assertThat(input.substring(url.start, url.end), is("https://faolex.fao.org/ docs/pdf/lao144178.pdf"));
+ }
+
+ @Test
+ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval() throws Exception {
+ final String input = "1. 'internal status' indicates that their records should be \n" +
+ "hidden in the interface. \n" +
+ "2. In our previous work [1] we reported 77.03% F1-\n" +
+ "score. There is a slight decrease in absolute scores \n" +
+ "between DeLFT 0.2.8 and DeLFT 0.3.0. One cause \n" +
+ "may be the use of different hyperparameters in \n" +
+ "version 0.3.0 such as batch size and learning rate. \n" +
+ "However, the most probable cause could be the \n" +
+ "impact of using the Huggingface tokenizers \n" +
+ "library which is suffering from quality issues \n" +
+ "https://github.com/kermitt2/delft/issues/150. \n" +
+ "\n" +
+ "\n";
+
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+
+ //This is the actual text that is passed and is different from the layoutToken text.
+ final String inputReal = "1. 'internal status' indicates that their records should be hidden in the interface. 2. In our previous work [1] we reported 77.03% F1score. There is a slight decrease in absolute scores between DeLFT 0.2.8 and DeLFT 0.3.0. One cause may be the use of different hyperparameters in version 0.3.0 such as batch size and learning rate. However, the most probable cause could be the impact of using the Huggingface tokenizers library which is suffering from quality issues https://github.com/kermitt2/delft/issues/150. ";
+
+ PDFAnnotation annotation = new PDFAnnotation();
+ annotation.setPageNumber(10);
+ List<BoundingBox> boundingBoxes = new ArrayList<>();
+ boundingBoxes.add(BoundingBox.fromPointAndDimensions(10, 84.30, 706.68, 177.39, 9.52));
+ annotation.setBoundingBoxes(boundingBoxes);
+ annotation.setDestination("https://github.com/kermitt2/delft/issues/150");
+ annotation.setType(PDFAnnotation.Type.URI);
+
+ List<PDFAnnotation> pdfAnnotations = List.of(annotation);
+ List<OffsetPosition> offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition url = offsetPositions.get(0);
+ assertThat(StringUtils.substring(input, url.start, url.end), is("https://github.com/kermitt2/delft/issues/150"));
+ }
+
+ @Test
+ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval2() throws Exception {
+ final String input = "This work is available at https://github.com/lfoppiano/ \n" +
+ "supercon2. The repository contains the code of the \n" +
+ "SuperCon 2 interface, the curation workflow, and the \n" +
+ "\n" +
+ "Table 2. Data support, the number of entities for each label in \n" +
+ "each of the datasets used for evaluating the ML models. The \n" +
+ "base dataset is the original dataset described in [18], and the \n" +
+ "curation dataset is automatically collected based on the data-\n" +
+ "base corrections by the interface and manually corrected. \n" +
+ "\n";
+
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+ LayoutToken lastTokenOfTheURL = tokenisedInput.get(19);
+ lastTokenOfTheURL.setPage(9);
+ lastTokenOfTheURL.setX(530.9363448275863);
+ lastTokenOfTheURL.setY(538.153);
+ lastTokenOfTheURL.setWidth(4.363655172413793);
+ lastTokenOfTheURL.setHeight(9.702);
+
+ PDFAnnotation annotation = new PDFAnnotation();
+ annotation.setPageNumber(9);
+ List<BoundingBox> boundingBoxes = new ArrayList<>();
+ boundingBoxes.add(BoundingBox.fromPointAndDimensions(9, 408.76, 537.11, 126.54, 10.49));
+ annotation.setBoundingBoxes(boundingBoxes);
+ annotation.setDestination("https://github.com/lfoppiano/supercon2");
+ annotation.setType(PDFAnnotation.Type.URI);
+
+ List<PDFAnnotation> pdfAnnotations = List.of(annotation);
+ List<OffsetPosition> offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition url = offsetPositions.get(0);
+ assertThat(input.substring(url.start, url.end), is("https://github.com/lfoppiano/ \nsupercon2"));
+ }
+
+ @Test
+ public void testTokensPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval2() throws Exception {
+ final String input = "This work is available at https://github.com/lfoppiano/ \n" +
+ "supercon2. The repository contains the code of the \n" +
+ "SuperCon 2 interface, the curation workflow, and the \n" +
+ "\n" +
+ "Table 2. Data support, the number of entities for each label in \n" +
+ "each of the datasets used for evaluating the ML models. The \n" +
+ "base dataset is the original dataset described in [18], and the \n" +
+ "curation dataset is automatically collected based on the data-\n" +
+ "base corrections by the interface and manually corrected. \n" +
+ "\n";
+
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+ LayoutToken lastTokenOfTheURL = tokenisedInput.get(19);
+ lastTokenOfTheURL.setPage(9);
+ lastTokenOfTheURL.setX(530.9363448275863);
+ lastTokenOfTheURL.setY(538.153);
+ lastTokenOfTheURL.setWidth(4.363655172413793);
+ lastTokenOfTheURL.setHeight(9.702);
+
+ PDFAnnotation annotation = new PDFAnnotation();
+ annotation.setPageNumber(9);
+ List<BoundingBox> boundingBoxes = new ArrayList<>();
+ boundingBoxes.add(BoundingBox.fromPointAndDimensions(9, 408.76, 537.11, 126.54, 10.49));
+ annotation.setBoundingBoxes(boundingBoxes);
+ annotation.setDestination("https://github.com/lfoppiano/supercon2");
+ annotation.setType(PDFAnnotation.Type.URI);
+
+ List<PDFAnnotation> pdfAnnotations = List.of(annotation);
+ List<OffsetPosition> offsetPositions = Lexicon.tokenPositionUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition url = offsetPositions.get(0);
+ // LF: we need a + 1 because the convention for the tokenPositionUrlPattern is inclusive, inclusive
+ assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(url.start, url.end + 1)), is("https://github.com/lfoppiano/ \nsupercon2"));
+ }
+
+ @Test
+ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectIntervalBasedOnText() throws Exception {
+ final String input = "This work is available at https://github.com/lfoppiano/ \n" +
+ "supercon2. The repository contains the code of the \n" +
+ "SuperCon 2 interface, the curation workflow, and the \n" +
+ "\n" +
+ "Table 2. Data support, the number of entities for each label in \n" +
+ "each of the datasets used for evaluating the ML models. The \n" +
+ "base dataset is the original dataset described in [18], and the \n" +
+ "curation dataset is automatically collected based on the data-\n" +
+ "base corrections by the interface and manually corrected. \n" +
+ "\n";
+
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+ LayoutToken lastTokenOfTheURL = tokenisedInput.get(19);
+ lastTokenOfTheURL.setPage(9);
+ lastTokenOfTheURL.setX(530.9363448275863);
+ lastTokenOfTheURL.setY(538.153);
+ lastTokenOfTheURL.setWidth(4.363655172413793);
+ lastTokenOfTheURL.setHeight(9.702);
+
+ PDFAnnotation annotation = new PDFAnnotation();
+ annotation.setPageNumber(9);
+ List<BoundingBox> boundingBoxes = new ArrayList<>();
+ boundingBoxes.add(BoundingBox.fromPointAndDimensions(9, 408.76, 537.11, 126.54, 10.49));
+ annotation.setBoundingBoxes(boundingBoxes);
+ annotation.setDestination("https://github.com/lfoppiano/supercon2");
+ annotation.setType(PDFAnnotation.Type.URI);
+ List<PDFAnnotation> pdfAnnotations = List.of(annotation);
+
+ //This is the actual text that is passed and is different from the layoutToken text.
+ final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. ";
+
+ List<OffsetPosition> offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations, inputReal);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition url = offsetPositions.get(0);
+ assertThat(inputReal.substring(url.start, url.end), is("https://github.com/lfoppiano/ supercon2"));
+ }
+
+ @Test
+ public void testCharacterPositionsUrlPatternWithPDFAnnotations2_URL_shouldReturnCorrectIntervalBasedOnText() throws Exception {
+ final String input = "Table S1: Gene annotations from which exon-exon junctions were extracted and unioned to obtain \n" +
+ "a list of annotated junctions. All tracks were taken from the UCSC Genome Browser [10] except for \n" +
+ "GENCODE [2], which was downloaded from the GENCODE website http://www.gencodegenes. \n" +
+ "org/releases/. Junction coordinates from hg38 annotations were lifted over to hg19 before the \n" +
+ "union was performed. Of all gene annotations listed here, the Swedish Bioinformatics Institute \n" +
+ "(SIB) genes has the most, with over 400,000 junctions for each of hg19 and hg38. \n";
+
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+ LayoutToken lastTokenOfTheURL = tokenisedInput.get(97);
+ lastTokenOfTheURL.setPage(19);
+ lastTokenOfTheURL.setX(465.54675000000003);
+ lastTokenOfTheURL.setY(404.908);
+ lastTokenOfTheURL.setWidth(68.727);
+ lastTokenOfTheURL.setHeight(9.0873);
+
+ PDFAnnotation annotation = new PDFAnnotation();
+ annotation.setPageNumber(19);
+ List<BoundingBox> boundingBoxes = new ArrayList<>();
+ boundingBoxes.add(BoundingBox.fromPointAndDimensions(19, 401.551, 402.396, 139.445, 12.901999999999987));
+ annotation.setBoundingBoxes(boundingBoxes);
+ annotation.setDestination("http://www.gencodegenes.org/releases/");
+ annotation.setType(PDFAnnotation.Type.URI);
+ List<PDFAnnotation> pdfAnnotations = List.of(annotation);
+
+ //This is the actual text that is passed and is different from the layoutToken text.
+ final String inputReal = "Table S1: Gene annotations from which exon-exon junctions were extracted and unioned to obtain a list of annotated junctions. All tracks were taken from the UCSC Genome Browser [10] except for GENCODE [2], which was downloaded from the GENCODE website http://www.gencodegenes. org/releases/. Junction coordinates from hg38 annotations were lifted over to hg19 before the union was performed. Of all gene annotations listed here, the Swedish Bioinformatics Institute (SIB) genes has the most, with over 400,000 junctions for each of hg19 and hg38. ";
+
+ List<OffsetPosition> offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations, inputReal);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition url = offsetPositions.get(0);
+ assertThat(inputReal.substring(url.start, url.end), is("http://www.gencodegenes. org/releases/"));
+ }
+}
diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java
index ff5ac7467b..8b53cc263e 100644
--- a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java
+++ b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java
@@ -8,12 +8,13 @@
import org.junit.Test;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import static org.hamcrest.CoreMatchers.is;
-import static org.hamcrest.CoreMatchers.startsWith;
import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.hasSize;
import static org.junit.Assert.*;
@@ -407,4 +408,166 @@ public void testOrcidPattern() {
}
}
}
+
+ @Test
+ public void testMatchTokenAndString() throws Exception {
+ final String input = "This work is available at https://github.com/lfoppiano/ \n" +
+ "supercon2. The repository contains the code of the \n" +
+ "SuperCon 2 interface, the curation workflow, and the \n" +
+ "\n" +
+ "Table 2. Data support, the number of entities for each label in \n" +
+ "each of the datasets used for evaluating the ML models. The \n" +
+ "base dataset is the original dataset described in [18], and the \n" +
+ "curation dataset is automatically collected based on the data-\n" +
+ "base corrections by the interface and manually corrected. \n" +
+ "\n";
+
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+ final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. ";
+ List<OffsetPosition> urlTokens = Arrays.asList(new OffsetPosition(10, 23));
+
+ List<OffsetPosition> offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, inputReal, urlTokens);
+
+ assertThat(offsetPositions, hasSize(1));
+ OffsetPosition url1 = offsetPositions.get(0);
+ assertThat(url1.start, is(26));
+ assertThat(url1.end, is(65));
+ assertThat(inputReal.substring(url1.start, url1.end), is("https://github.com/lfoppiano/ supercon2"));
+
+ }
+
+
+ @Test
+ public void testMatchTokenAndString_twoElements() throws Exception {
+ final String input = "This work is available at https://github.com/lfoppiano/ \n" +
+ "supercon2. The repository contains the code of the \n" +
+ "SuperCon 2 interface, the curation workflow, and the \n" +
+ "\n" +
+ "Table 2. Data support, the number of entities for each label in \n" +
+ "each of the datasets used for evaluating the ML models. The \n" +
+ "base dataset is the original dataset described in [18], and the \n" +
+ "curation dataset is automatically collected based on the data-\n" +
+ "base corrections by the interface and manually corrected. \n" +
+ "\n";
+
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+ final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. ";
+ List<OffsetPosition> urlTokens = Arrays.asList(new OffsetPosition(0, 3), new OffsetPosition(10, 23));
+
+ List<OffsetPosition> offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, inputReal, urlTokens);
+
+ assertThat(offsetPositions, hasSize(2));
+ OffsetPosition url0 = offsetPositions.get(0);
+ assertThat(url0.start, is(0));
+ assertThat(url0.end, is(9));
+
+ assertThat(inputReal.substring(url0.start, url0.end), is("This work"));
+
+ OffsetPosition url1 = offsetPositions.get(1);
+ assertThat(url1.start, is(26));
+ assertThat(url1.end, is(65));
+
+ assertThat(inputReal.substring(url1.start, url1.end), is("https://github.com/lfoppiano/ supercon2"));
+
+ }
+
+ @Test
+ public void testMatchTokenAndString_twoElementsWithEqualValue() throws Exception {
+ final String input = "Christophe Castagne, Claudie Marec, Claudie Marec, Claudio Stalder,";
+
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+ List<OffsetPosition> urlTokens = Arrays.asList(
+ new OffsetPosition(0, 3),
+ new OffsetPosition(5, 8),
+ new OffsetPosition(10, 13),
+ new OffsetPosition(15, 18)
+ );
+
+ List<OffsetPosition> offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, input, urlTokens);
+
+ assertThat(offsetPositions, hasSize(4));
+
+ OffsetPosition url0 = offsetPositions.get(0);
+ assertThat(url0.start, is(0));
+ assertThat(url0.end, is(19));
+
+ assertThat(input.substring(url0.start, url0.end), is("Christophe Castagne"));
+
+ OffsetPosition url1 = offsetPositions.get(1);
+ assertThat(url1.start, is(21));
+ assertThat(url1.end, is(34));
+
+ assertThat(input.substring(url1.start, url1.end), is("Claudie Marec"));
+
+ OffsetPosition url2 = offsetPositions.get(2);
+ assertThat(url2.start, is(36));
+ assertThat(url2.end, is(49));
+
+ assertThat(input.substring(url2.start, url2.end), is("Claudie Marec"));
+
+ OffsetPosition url3 = offsetPositions.get(3);
+ assertThat(url3.start, is(51));
+ assertThat(url3.end, is(66));
+
+ assertThat(input.substring(url3.start, url3.end), is("Claudio Stalder"));
+
+ }
+
+ @Test
+ public void testMatchTokenAndString_twoElementsWithEqualValue2() throws Exception {
+ final String input = "We thank Felix Randow, Shigeki Higashiyama and Feng Zhang for plasmids.We thank Florian Steinberg for discussions and disclosure of unpublished results.We thank Matthew Freeman for helpful discussions.We express our deep gratitude to Moises Mallo for advice concerning CRISPR plus CRISPR reagents.We are grateful for the assistance of Ana Nóvoa and IGC's transgenics and mouse facilities.We thank IGC's cell sorting/flow cytometry, sequencing, and histopathology facilities.";
+
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+ List<OffsetPosition> annotationTokenPositions = Arrays.asList(
+ new OffsetPosition(4, 7),
+ new OffsetPosition(9, 12),
+ new OffsetPosition(15, 18),
+ new OffsetPosition(27, 30),
+ new OffsetPosition(49, 52),
+ new OffsetPosition(71, 74),
+ new OffsetPosition(103, 106),
+ new OffsetPosition(109, 110),
+ new OffsetPosition(125, 126)
+ );
+
+ List<OffsetPosition> offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, input, annotationTokenPositions);
+
+ assertThat(offsetPositions, hasSize(9));
+
+ OffsetPosition url7 = offsetPositions.get(7);
+ assertThat(url7.start, is(349));
+ assertThat(url7.end, is(352));
+
+ assertThat(input.substring(url7.start, url7.end), is("IGC"));
+
+ OffsetPosition url8 = offsetPositions.get(8);
+ assertThat(url8.start, is(397));
+ assertThat(url8.end, is(400));
+
+ assertThat(input.substring(url8.start, url8.end), is("IGC"));
+
+ }
+
+ @Test
+ public void testMatchTokenAndString_twoElementsWithEqualValue3() throws Exception {
+ final String input = "We thank Benoit Demars for providing reaeration data and comments that signficantly improved the manuscript.This study was supported a NERC Case studentship awarded to DP, GYD and SJ, an ERC starting grant awarded to GYD, and the University of Exeter.";
+
+ List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+ List<OffsetPosition> annotationTokenPositions = Arrays.asList(
+ new OffsetPosition(4, 7),
+ new OffsetPosition(40, 41),
+ new OffsetPosition(62, 63),
+ new OffsetPosition(79, 84)
+ );
+
+ List<OffsetPosition> offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, input, annotationTokenPositions);
+
+ assertThat(offsetPositions, hasSize(4));
+
+ OffsetPosition position1 = offsetPositions.get(1);
+ assertThat(input.substring(position1.start, position1.end), is("NERC"));
+
+ OffsetPosition position2 = offsetPositions.get(2);
+ assertThat(input.substring(position2.start, position2.end), is("ERC"));
+ }
}
diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/UtilitiesTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/UtilitiesTest.java
index 64d6d4be7a..9e5a6958ff 100644
--- a/grobid-core/src/test/java/org/grobid/core/utilities/UtilitiesTest.java
+++ b/grobid-core/src/test/java/org/grobid/core/utilities/UtilitiesTest.java
@@ -3,10 +3,17 @@
import java.io.File;
import java.io.IOException;
+import java.util.Arrays;
import java.util.List;
import java.util.ArrayList;
+import org.grobid.core.analyzers.GrobidAnalyzer;
+import org.grobid.core.layout.LayoutToken;
import org.junit.Test;
+
+import static org.grobid.core.utilities.Utilities.convertStringOffsetToTokenOffset;
+import static org.hamcrest.Matchers.hasSize;
+import static org.hamcrest.Matchers.is;
import static org.junit.Assert.*;
public class UtilitiesTest {
@@ -112,4 +119,16 @@ public void testMergePositionsOverlap() throws IOException {
assertEquals(positions.get(1).start, 7);
assertEquals(positions.get(1).end, 10);
}
+
+ @Test
+ public void testConvertStringOffsetToTokenOffset() throws Exception {
+ String input = "This is a token.";
+ List<LayoutToken> layoutTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+ OffsetPosition stringPosition = new OffsetPosition(5, 9);
+ List<OffsetPosition> tokenOffsets = convertStringOffsetToTokenOffset(Arrays.asList(stringPosition), layoutTokens);
+
+ assertThat(tokenOffsets, hasSize(1));
+ OffsetPosition position = tokenOffsets.get(0);
+ assertThat(LayoutTokensUtil.toText(layoutTokens.subList(position.start, position.end + 1)), is("is a"));
+ }
}