Avoid splitting URLs between sentences #1097

Merged: 16 commits, Jun 9, 2024
Changes from all commits
2 changes: 1 addition & 1 deletion grobid-core/src/main/java/org/grobid/core/data/Figure.java
@@ -432,7 +432,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}

if (desc != null && config.isWithSentenceSegmentation()) {
- formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage());
+ formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());

// we need a sentence segmentation of the figure caption, for that we need to introduce
// a <div>, then a <p>
4 changes: 2 additions & 2 deletions grobid-core/src/main/java/org/grobid/core/data/Table.java
@@ -141,7 +141,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}

if (desc != null && config.isWithSentenceSegmentation()) {
- formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage());
+ formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());

// we need a sentence segmentation of the table caption, for that we need to introduce
// a <div>, then a <p>
@@ -215,7 +215,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form

if (noteNode != null && config.isWithSentenceSegmentation()) {
// we need a sentence segmentation of the figure caption
- formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage());
+ formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
}

// enclose note content in a <p> element
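
Why these caption changes matter: dots inside a URL look like sentence-final periods to a segmenter, so a caption URL could end up split across two sentence elements. The sketch below is not GROBID code; it uses a deliberately naive splitter and a hypothetical Span type to show how treating the URL's character span as a "forbidden" region (which is what the PDF link annotations ultimately feed) keeps the address inside a single sentence.

import java.util.ArrayList;
import java.util.List;

public class ForbiddenSpanDemo {

    // A [start, end) character span that must not contain a sentence boundary.
    record Span(int start, int end) {
        boolean contains(int offset) { return offset >= start && offset < end; }
    }

    // Deliberately naive splitter: every '.' ends a sentence unless it falls
    // inside a forbidden span (e.g. a URL). GROBID's SentenceUtilities is far
    // smarter; this only illustrates the role of the forbidden positions.
    static List<String> split(String text, List<Span> forbidden) {
        List<String> sentences = new ArrayList<>();
        int sentenceStart = 0;
        for (int i = 0; i < text.length(); i++) {
            if (text.charAt(i) != '.') continue;
            final int dot = i;
            boolean protectedDot = forbidden.stream().anyMatch(s -> s.contains(dot));
            if (!protectedDot) {
                sentences.add(text.substring(sentenceStart, dot + 1).trim());
                sentenceStart = dot + 1;
            }
        }
        if (sentenceStart < text.length()) {
            sentences.add(text.substring(sentenceStart).trim());
        }
        return sentences;
    }

    public static void main(String[] args) {
        String caption = "The dataset is at https://doi.org/10.1000/demo.data and is updated monthly.";
        String url = "https://doi.org/10.1000/demo.data";   // hypothetical URL
        int start = caption.indexOf(url);
        System.out.println(split(caption, List.of()));
        // -> the URL is broken into four pieces
        System.out.println(split(caption, List.of(new Span(start, start + url.length()))));
        // -> one sentence, URL intact
    }
}
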
@@ -28,6 +28,7 @@
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.lang.Language;
import org.grobid.core.layout.*;
+ import org.grobid.core.lexicon.Lexicon;
import org.grobid.core.utilities.SentenceUtilities;
import org.grobid.core.tokenization.TaggingTokenCluster;
import org.grobid.core.tokenization.TaggingTokenClusteror;
@@ -1320,7 +1321,7 @@ private StringBuilder toTEINote(StringBuilder tei,


if (config.isWithSentenceSegmentation()) {
- segmentIntoSentences(pNote, noteTokens, config, doc.getLanguage());
+ segmentIntoSentences(pNote, noteTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
}

desc.appendChild(pNote);
@@ -1522,7 +1523,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(clusterTokens);
if (isNewParagraph(lastClusterLabel, curParagraph)) {
if (curParagraph != null && config.isWithSentenceSegmentation()) {
- segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage());
+ segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
}
curParagraph = teiElement("p");
if (config.isGenerateTeiIds()) {
@@ -1550,7 +1551,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
} else {
if (isNewParagraph(lastClusterLabel, curParagraph)) {
if (curParagraph != null && config.isWithSentenceSegmentation()) {
- segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage());
+ segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
}
curParagraph = teiElement("p");
if (config.isGenerateTeiIds()) {
@@ -1769,7 +1770,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,

// in case we segment paragraph into sentences, we still need to do it for the last paragraph
if (curParagraph != null && config.isWithSentenceSegmentation()) {
- segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage());
+ segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
}

// remove possibly empty div in the div list
@@ -1836,6 +1837,10 @@ public static boolean isNewParagraph(TaggingLabel lastClusterLabel, Element curP
}

public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curParagraphTokens, GrobidAnalysisConfig config, String lang) {
+ segmentIntoSentences(curParagraph, curParagraphTokens, config, lang, new ArrayList<>());
+ }
+
+ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curParagraphTokens, GrobidAnalysisConfig config, String lang, List<PDFAnnotation> annotations) {
// in order to avoid having a sentence boundary in the middle of a ref element
// (which is frequent given the abbreviation in the reference expression, e.g. Fig.)
// we only consider for sentence segmentation texts under <p> and skip the text under <ref>.
@@ -1844,7 +1849,7 @@ public void segmentIntoSentences(Element curPara

// in xom, the following gives all the text under the element, for the whole subtree
String text = curParagraph.getValue();
- if (text == null || text.length() == 0)
+ if (StringUtils.isEmpty(text))
return;

// identify ref nodes, ref spans and ref positions
@@ -1861,8 +1866,8 @@ public void segmentIntoSentences(Element curPara
// for readability in another conditional
if (((Element) theNode).getLocalName().equals("ref")) {
// map character offset of the node
- mapRefNodes.put(Integer.valueOf(pos), theNode);
- refPositions.add(Integer.valueOf(pos));
+ mapRefNodes.put(pos, theNode);
+ refPositions.add(pos);

String chunk = theNode.getValue();
forbiddenPositions.add(new OffsetPosition(pos, pos+chunk.length()));
@@ -1871,6 +1876,9 @@ public void segmentIntoSentences(Element curPara
}
}

+ List<OffsetPosition> offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations, text);
+ forbiddenPositions.addAll(offsetPositionsUrls);
+
List<OffsetPosition> theSentences =
SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang));

@@ -1893,7 +1901,7 @@ public void segmentIntoSentences(Element curPara

for(int i=0; i<curParagraphTokens.size(); i++) {
LayoutToken token = curParagraphTokens.get(i);
- if (token.getText() == null || token.getText().length() == 0)
+ if (StringUtils.isEmpty(token.getText()))
continue;
int newPos = sentenceChunk.indexOf(token.getText(), pos);
if ((newPos != -1) || SentenceUtilities.toSkipToken(token.getText())) {
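
The heart of the change is in segmentIntoSentences above: besides the existing <ref> spans, the method now asks Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations, text) for URL character positions and adds them to forbiddenPositions before SentenceUtilities.runSentenceDetection is called. The helper itself is defined elsewhere in this PR; the sketch below only illustrates one plausible way such URL spans could be gathered from a regex pass plus PDF link annotation targets (the Span record, the URL regex and the urlSpans helper are all hypothetical, not GROBID API).

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class UrlSpanSketch {

    record Span(int start, int end) {}

    // Rough stand-in for a real URL pattern.
    private static final Pattern URL = Pattern.compile("https?://\\S+");

    // Collect URL spans from a regex pass and from PDF link annotation targets
    // that appear literally in the paragraph text; overlapping spans are harmless
    // because they are only used as "do not split here" regions.
    static List<Span> urlSpans(String text, List<String> annotationTargets) {
        List<Span> spans = new ArrayList<>();
        Matcher m = URL.matcher(text);
        while (m.find()) {
            spans.add(new Span(m.start(), m.end()));
        }
        for (String target : annotationTargets) {
            int idx = text.indexOf(target);
            if (idx >= 0) {
                spans.add(new Span(idx, idx + target.length()));
            }
        }
        return spans;
    }

    public static void main(String[] args) {
        String text = "Code at https://example.org/tool (accessed 2024). More text follows.";
        // Hypothetical link annotation target carried by the PDF for the same URL.
        List<Span> forbidden = urlSpans(text, List.of("https://example.org/tool"));
        // In the patch, spans like these are appended to forbiddenPositions and handed to
        // SentenceUtilities.runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang)).
        System.out.println(forbidden);
    }
}
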
@@ -11,6 +11,7 @@

import java.nio.charset.StandardCharsets;

+ import org.apache.lucene.util.CollectionUtil;
import org.grobid.core.GrobidModels;
import org.grobid.core.data.*;
import org.grobid.core.document.Document;
@@ -478,7 +479,7 @@ public Pair<String, List<LayoutToken>> processShort(List<LayoutToken> tokens, Do
List<LayoutToken> currentChunk = new ArrayList<>();
int currentPos = 0;
for(LayoutToken token : tokens) {
- if (currentChunk.size() != 0) {
+ if (CollectionUtils.isNotEmpty(currentChunk)) {
int tokenPos = token.getOffset();
if (currentPos != tokenPos) {
// new chunk
@@ -508,7 +509,7 @@ public Pair<String, List<LayoutToken>> processShort(List<LayoutToken> tokens, Do
LayoutTokenization layouts = featSeg.getRight();
if (layouts != null)
layoutTokenization = layouts.getTokenization();
- if ( (featuredText != null) && (featuredText.trim().length() > 0) ) {
+ if (StringUtils.isNotBlank(featuredText)) {
res = label(featuredText);
res = postProcessFullTextLabeledText(res);
}
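
The two hunks above (and the StringUtils.isEmpty changes earlier) are readability refactors: hand-rolled emptiness tests become null-safe Apache Commons helpers. A quick reference sketch of the equivalences, assuming commons-lang3 and commons-collections4 on the classpath:

import java.util.List;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;

public class NullSafeChecks {
    public static void main(String[] args) {
        List<String> currentChunk = List.of("token");
        String featuredText = "   ";

        // currentChunk.size() != 0  ->  CollectionUtils.isNotEmpty(currentChunk)
        // (also returns false instead of throwing if the list is null)
        System.out.println(CollectionUtils.isNotEmpty(currentChunk)); // true

        // featuredText != null && featuredText.trim().length() > 0  ->  StringUtils.isNotBlank(featuredText)
        System.out.println(StringUtils.isNotBlank(featuredText));     // false: whitespace only

        // text == null || text.length() == 0  ->  StringUtils.isEmpty(text)
        System.out.println(StringUtils.isEmpty(""));                  // true
    }
}
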
@@ -102,7 +102,7 @@ public MutablePair<Element, MutableTriple<List<Funding>,List<Person>,List<Affili
/**
* For convenience, a processing method taking an TEI XML segment as input - only paragraphs (Element p)
* will be processed in this segment and paragraph element will be replaced with the processed content.
- * Resulting entities are relative to the whole procssed XML segment.
+ * Resulting entities are relative to the whole processed XML segment.
*
* Tokenization is done with the default Grobid analyzer triggered by the identified language.
**/
@@ -178,7 +178,7 @@ public MutablePair<Element, MutableTriple<List<Funding>,List<Person>,List<Affili
/**
* The processing here is called from the header and/or full text parser in cascade
* when one of these higher-level model detect a "funding" section, or in case
- no funding section is found, when a acknolwedgements section is detected.
+ no funding section is found, when an acknolwedgement section is detected.
*
* Independently from the place this parser is called, it process the input sequence
* of layout tokens in a context free manner.