Avoid splitting URLs between sentences #1097

Merged: 16 commits, Jun 9, 2024
Changes from all commits
2 changes: 1 addition & 1 deletion grobid-core/src/main/java/org/grobid/core/data/Figure.java
@@ -432,7 +432,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}

if (desc != null && config.isWithSentenceSegmentation()) {
- formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage());
+ formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());

// we need a sentence segmentation of the figure caption, for that we need to introduce
// a <div>, then a <p>
4 changes: 2 additions & 2 deletions grobid-core/src/main/java/org/grobid/core/data/Table.java
@@ -141,7 +141,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}

if (desc != null && config.isWithSentenceSegmentation()) {
- formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage());
+ formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());

// we need a sentence segmentation of the table caption, for that we need to introduce
// a <div>, then a <p>
@@ -215,7 +215,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form

if (noteNode != null && config.isWithSentenceSegmentation()) {
// we need a sentence segmentation of the figure caption
- formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage());
+ formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
}

// enclose note content in a <p> element
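
Why these caption changes matter: dots inside a URL look like sentence-final periods to a segmenter, so a caption URL could end up split across two sentence elements. The sketch below is not GROBID code; it uses a deliberately naive splitter and a hypothetical Span type to show how treating the URL's character span as a "forbidden" region (which is what the PDF link annotations ultimately feed) keeps the address inside a single sentence.

import java.util.ArrayList;
import java.util.List;

public class ForbiddenSpanDemo {

    // A [start, end) character span that must not contain a sentence boundary.
    record Span(int start, int end) {
        boolean contains(int offset) { return offset >= start && offset < end; }
    }

    // Deliberately naive splitter: every '.' ends a sentence unless it falls
    // inside a forbidden span (e.g. a URL). GROBID's SentenceUtilities is far
    // smarter; this only illustrates the role of the forbidden positions.
    static List<String> split(String text, List<Span> forbidden) {
        List<String> sentences = new ArrayList<>();
        int sentenceStart = 0;
        for (int i = 0; i < text.length(); i++) {
            if (text.charAt(i) != '.') continue;
            final int dot = i;
            boolean protectedDot = forbidden.stream().anyMatch(s -> s.contains(dot));
            if (!protectedDot) {
                sentences.add(text.substring(sentenceStart, dot + 1).trim());
                sentenceStart = dot + 1;
            }
        }
        if (sentenceStart < text.length()) {
            sentences.add(text.substring(sentenceStart).trim());
        }
        return sentences;
    }

    public static void main(String[] args) {
        String caption = "The dataset is at https://doi.org/10.1000/demo.data and is updated monthly.";
        String url = "https://doi.org/10.1000/demo.data";   // hypothetical URL
        int start = caption.indexOf(url);
        System.out.println(split(caption, List.of()));
        // -> the URL is broken into four pieces
        System.out.println(split(caption, List.of(new Span(start, start + url.length()))));
        // -> one sentence, URL intact
    }
}
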
@@ -28,6 +28,7 @@
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.lang.Language;
import org.grobid.core.layout.*;
+ import org.grobid.core.lexicon.Lexicon;
import org.grobid.core.utilities.SentenceUtilities;
import org.grobid.core.tokenization.TaggingTokenCluster;
import org.grobid.core.tokenization.TaggingTokenClusteror;
@@ -1320,7 +1321,7 @@ private StringBuilder toTEINote(StringBuilder tei,


if (config.isWithSentenceSegmentation()) {
- segmentIntoSentences(pNote, noteTokens, config, doc.getLanguage());
+ segmentIntoSentences(pNote, noteTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
}

desc.appendChild(pNote);
@@ -1522,7 +1523,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(clusterTokens);
if (isNewParagraph(lastClusterLabel, curParagraph)) {
if (curParagraph != null && config.isWithSentenceSegmentation()) {
- segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage());
+ segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
}
curParagraph = teiElement("p");
if (config.isGenerateTeiIds()) {
@@ -1550,7 +1551,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
} else {
if (isNewParagraph(lastClusterLabel, curParagraph)) {
if (curParagraph != null && config.isWithSentenceSegmentation()) {
- segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage());
+ segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
}
curParagraph = teiElement("p");
if (config.isGenerateTeiIds()) {
@@ -1769,7 +1770,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,

// in case we segment paragraph into sentences, we still need to do it for the last paragraph
if (curParagraph != null && config.isWithSentenceSegmentation()) {
- segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage());
+ segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
}

// remove possibly empty div in the div list
@@ -1836,6 +1837,10 @@ public static boolean isNewParagraph(TaggingLabel lastClusterLabel, Element curP
}

public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curParagraphTokens, GrobidAnalysisConfig config, String lang) {
+ segmentIntoSentences(curParagraph, curParagraphTokens, config, lang, new ArrayList<>());
+ }
+
+ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curParagraphTokens, GrobidAnalysisConfig config, String lang, List<PDFAnnotation> annotations) {
// in order to avoid having a sentence boundary in the middle of a ref element
// (which is frequent given the abbreviation in the reference expression, e.g. Fig.)
// we only consider for sentence segmentation texts under <p> and skip the text under <ref>.
@@ -1844,7 +1849,7 @@ public void segmentIntoSentences(Element curPara

// in xom, the following gives all the text under the element, for the whole subtree
String text = curParagraph.getValue();
- if (text == null || text.length() == 0)
+ if (StringUtils.isEmpty(text))
return;

// identify ref nodes, ref spans and ref positions
@@ -1861,8 +1866,8 @@ public void segmentIntoSentences(Element curPara
// for readability in another conditional
if (((Element) theNode).getLocalName().equals("ref")) {
// map character offset of the node
- mapRefNodes.put(Integer.valueOf(pos), theNode);
- refPositions.add(Integer.valueOf(pos));
+ mapRefNodes.put(pos, theNode);
+ refPositions.add(pos);

String chunk = theNode.getValue();
forbiddenPositions.add(new OffsetPosition(pos, pos+chunk.length()));
@@ -1871,6 +1876,9 @@ public void segmentIntoSentences(Element curPara
}
}

+ List<OffsetPosition> offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations, text);
+ forbiddenPositions.addAll(offsetPositionsUrls);
+
List<OffsetPosition> theSentences =
SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang));

@@ -1893,7 +1901,7 @@ public void segmentIntoSentences(Element curPara

for(int i=0; i<curParagraphTokens.size(); i++) {
LayoutToken token = curParagraphTokens.get(i);
- if (token.getText() == null || token.getText().length() == 0)
+ if (StringUtils.isEmpty(token.getText()))
continue;
int newPos = sentenceChunk.indexOf(token.getText(), pos);
if ((newPos != -1) || SentenceUtilities.toSkipToken(token.getText())) {
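
The heart of the change is in segmentIntoSentences above: besides the existing <ref> spans, the method now asks Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations, text) for URL character positions and adds them to forbiddenPositions before SentenceUtilities.runSentenceDetection is called. The helper itself is defined elsewhere in this PR; the sketch below only illustrates one plausible way such URL spans could be gathered from a regex pass plus PDF link annotation targets (the Span record, the URL regex and the urlSpans helper are all hypothetical, not GROBID API).

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class UrlSpanSketch {

    record Span(int start, int end) {}

    // Rough stand-in for a real URL pattern.
    private static final Pattern URL = Pattern.compile("https?://\\S+");

    // Collect URL spans from a regex pass and from PDF link annotation targets
    // that appear literally in the paragraph text; overlapping spans are harmless
    // because they are only used as "do not split here" regions.
    static List<Span> urlSpans(String text, List<String> annotationTargets) {
        List<Span> spans = new ArrayList<>();
        Matcher m = URL.matcher(text);
        while (m.find()) {
            spans.add(new Span(m.start(), m.end()));
        }
        for (String target : annotationTargets) {
            int idx = text.indexOf(target);
            if (idx >= 0) {
                spans.add(new Span(idx, idx + target.length()));
            }
        }
        return spans;
    }

    public static void main(String[] args) {
        String text = "Code at https://example.org/tool (accessed 2024). More text follows.";
        // Hypothetical link annotation target carried by the PDF for the same URL.
        List<Span> forbidden = urlSpans(text, List.of("https://example.org/tool"));
        // In the patch, spans like these are appended to forbiddenPositions and handed to
        // SentenceUtilities.runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang)).
        System.out.println(forbidden);
    }
}
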
@@ -11,6 +11,7 @@

import java.nio.charset.StandardCharsets;

+ import org.apache.lucene.util.CollectionUtil;
import org.grobid.core.GrobidModels;
import org.grobid.core.data.*;
import org.grobid.core.document.Document;
@@ -478,7 +479,7 @@ public Pair<String, List<LayoutToken>> processShort(List<LayoutToken> tokens, Do
List<LayoutToken> currentChunk = new ArrayList<>();
int currentPos = 0;
for(LayoutToken token : tokens) {
- if (currentChunk.size() != 0) {
+ if (CollectionUtils.isNotEmpty(currentChunk)) {
int tokenPos = token.getOffset();
if (currentPos != tokenPos) {
// new chunk
@@ -508,7 +509,7 @@ public Pair<String, List<LayoutToken>> processShort(List<LayoutToken> tokens, Do
LayoutTokenization layouts = featSeg.getRight();
if (layouts != null)
layoutTokenization = layouts.getTokenization();
- if ( (featuredText != null) && (featuredText.trim().length() > 0) ) {
+ if (StringUtils.isNotBlank(featuredText)) {
res = label(featuredText);
res = postProcessFullTextLabeledText(res);
}
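
The two hunks above (and the StringUtils.isEmpty changes earlier) are readability refactors: hand-rolled emptiness tests become null-safe Apache Commons helpers. A quick reference sketch of the equivalences, assuming commons-lang3 and commons-collections4 on the classpath:

import java.util.List;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;

public class NullSafeChecks {
    public static void main(String[] args) {
        List<String> currentChunk = List.of("token");
        String featuredText = "   ";

        // currentChunk.size() != 0  ->  CollectionUtils.isNotEmpty(currentChunk)
        // (also returns false instead of throwing if the list is null)
        System.out.println(CollectionUtils.isNotEmpty(currentChunk)); // true

        // featuredText != null && featuredText.trim().length() > 0  ->  StringUtils.isNotBlank(featuredText)
        System.out.println(StringUtils.isNotBlank(featuredText));     // false: whitespace only

        // text == null || text.length() == 0  ->  StringUtils.isEmpty(text)
        System.out.println(StringUtils.isEmpty(""));                  // true
    }
}
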
@@ -102,7 +102,7 @@ public MutablePair<Element, MutableTriple<List<Funding>,List<Person>,List<Affili
/**
* For convenience, a processing method taking an TEI XML segment as input - only paragraphs (Element p)
* will be processed in this segment and paragraph element will be replaced with the processed content.
- * Resulting entities are relative to the whole procssed XML segment.
+ * Resulting entities are relative to the whole processed XML segment.
*
* Tokenization is done with the default Grobid analyzer triggered by the identified language.
**/
@@ -178,7 +178,7 @@ public MutablePair<Element, MutableTriple<List<Funding>,List<Person>,List<Affili
/**
* The processing here is called from the header and/or full text parser in cascade
* when one of these higher-level model detect a "funding" section, or in case
- no funding section is found, when a acknolwedgements section is detected.
+ no funding section is found, when an acknolwedgement section is detected.
*
* Independently from the place this parser is called, it process the input sequence
* of layout tokens in a context free manner.