Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix OOBE when processing large quantities of notes #1075

Merged
merged 3 commits into from
Jan 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions .github/workflows/ci-build-unstable.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,25 @@ jobs:
format: jacoco
file: build/reports/jacoco/codeCoverageReport/codeCoverageReport.xml
debug: true

docker-build:
needs: [ build ]
runs-on: ubuntu-latest

steps:
- name: Create more disk space
run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY"
- uses: actions/checkout@v2
- name: Build and push
id: docker_build
uses: mr-smithers-excellent/docker-build-push@v5
with:
username: ${{ secrets.DOCKERHUB_USERNAME_LFOPPIANO }}
password: ${{ secrets.DOCKERHUB_TOKEN_LFOPPIANO }}
image: lfoppiano/grobid
registry: docker.io
pushImage: ${{ github.event_name != 'pull_request' }}
tags: latest-develop
dockerfile: Dockerfile.crf
- name: Image digest
run: echo ${{ steps.docker_build.outputs.digest }}
7 changes: 6 additions & 1 deletion grobid-core/src/main/java/org/grobid/core/data/Note.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.grobid.core.data;

import org.apache.commons.collections4.CollectionUtils;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.layout.Page;
import org.grobid.core.utilities.*;
Expand Down Expand Up @@ -74,7 +75,11 @@ public void setOffsetStartInPage(int offsetStartInPage) {
}

public int getPageNumber() {
return tokens.get(0).getPage();
if (CollectionUtils.isNotEmpty(tokens))
return tokens.get(0).getPage();
else {
return -1;
}
}

public String getText() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1053,7 +1053,6 @@ protected List<Note> getTeiNotes(Document doc, SortedSet<DocumentPiece> document
}

String footText = doc.getDocumentPieceText(docPiece);
footText = TextUtilities.dehyphenize(footText);
footText = footText.replace("\n", " ");
//footText = footText.replace(" ", " ").trim();
if (footText.length() < 6)
Expand All @@ -1071,6 +1070,9 @@ protected List<Note> getTeiNotes(Document doc, SortedSet<DocumentPiece> document
if (localNotes != null)
notes.addAll(localNotes);
}

notes.stream()
.forEach(n -> n.setText(TextUtilities.dehyphenize(n.getText())));

return notes;
}
Expand All @@ -1089,7 +1091,7 @@ protected List<Note> makeNotes(List<LayoutToken> noteTokens, String footText, No
String groupStr = ma.group(1);
footText = ma.group(2);
try {
if (groupStr.indexOf(".") != -1)
if (groupStr.contains("."))
sugarText = ".";
String groupStrNormalized = groupStr.replace(".", "");
groupStrNormalized = groupStrNormalized.trim();
Expand All @@ -1109,11 +1111,12 @@ protected List<Note> makeNotes(List<LayoutToken> noteTokens, String footText, No
} else
break;

if (toConsume.length() == 0)
if (StringUtils.isEmpty(toConsume))
break;
}
if (start != 0)
if (start != 0) {
noteTokens = noteTokens.subList(start, noteTokens.size());
}
}
} catch (NumberFormatException e) {
currentNumber = -1;
Expand All @@ -1130,38 +1133,38 @@ protected List<Note> makeNotes(List<LayoutToken> noteTokens, String footText, No

// add possible subsequent notes concatenated in the same note sequence (this is a common error,
// which is addressed here by heuristics, it may not be necessary in the future with a better
// segmentation model using more foot notes training data)
// segmentation model using more footnotes training data)
if (currentNumber != -1) {
String nextLabel = " " + (currentNumber+1);
// suger characters after note number must be consistent with the previous ones to avoid false match
// sugar characters after note number must be consistent with the previous ones to avoid false match
if (sugarText != null)
nextLabel += sugarText;

int ind = footText.indexOf(nextLabel);
if (ind != -1) {
int nextFootnoteLabelIndex = footText.indexOf(nextLabel);
if (nextFootnoteLabelIndex != -1) {
// optionally we could restrict here to superscript numbers
// review local note
localNote.setText(footText.substring(0, ind));
localNote.setText(footText.substring(0, nextFootnoteLabelIndex));
int pos = 0;
List<LayoutToken> previousNoteTokens = new ArrayList<>();
List<LayoutToken> nextNoteTokens = new ArrayList<>();
for(LayoutToken localToken : noteTokens) {
if (localToken.getText() == null || localToken.getText().length() == 0)
if (StringUtils.isEmpty(localToken.getText()))
continue;
pos += localToken.getText().length();
if (pos <= ind+1) {
if (pos <= nextFootnoteLabelIndex+1) {
previousNoteTokens.add(localToken);
} else {
nextNoteTokens.add(localToken);
}
}
localNote.setTokens(previousNoteTokens);
String nextFootText = footText.substring(ind+1, footText.length());
String nextFootText = footText.substring(nextFootnoteLabelIndex+1);

// process the concatenated note
if (nextNoteTokens.size() >0 && nextFootText.length()>0) {
if (CollectionUtils.isNotEmpty(nextNoteTokens) && StringUtils.isNotEmpty(nextFootText)) {
List<Note> nextNotes = makeNotes(nextNoteTokens, nextFootText, noteType, notes.size());
if (nextNotes != null && nextNotes.size()>0)
if (CollectionUtils.isNotEmpty(nextNotes))
notes.addAll(nextNotes);
}
}
Expand Down Expand Up @@ -1460,7 +1463,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
int clusterPage = Iterables.getLast(clusterTokens).getPage();

List<Note> notesSamePage = null;
if (notes != null && notes.size() > 0) {
if (CollectionUtils.isNotEmpty(notes)) {
notesSamePage = notes.stream()
.filter(f -> !f.isIgnored() && f.getPageNumber() == clusterPage)
.collect(Collectors.toList());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import java.util.List;

import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.hasSize;
import static org.junit.Assert.assertThat;

public class TEIFormatterTest {
Expand All @@ -35,6 +37,30 @@ public void testMakeFootNote() throws Exception {
assertThat(LayoutTokensUtil.toText(footnote.getTokens()), is("This is a footnote"));
assertThat(footnote.getLabel(), is("1"));
}


@Test
public void testMakeNotes() throws Exception {
String text = "198 U.S. Const. art. I, § §9 & 10. \n199 To be sure, there are revisionist arguments that the Ex Post Facto clause itself extends to retroactive civil laws too. See Eastern Enterprises v. Apfel, 524 U.S. 498, 538-39 (1998) (Thomas, J., concurring). And as with bills of attainder, in the wake of the Civil War the Supreme Court held that Ironclad Oath requirements were ex post facto laws as well. Cummings, 71 U.S. at 326-332; Garland, 71 U.S. at 377-368. But as discussed in the text, even these principles do not ensnare Section Three going forward, on a non-ex-post-facto basis \n200 3 U.S. at 378-80 (arguments of counsel). \n201 Id. \n202 Id. at 382. See Baude & Sachs, Eleventh Amendment, supra note 9, at 626-627. Electronic copy available at: https://ssrn.com/abstract=4532751";
List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
text = text.replace("\n", " ");
tokens.stream().forEach(t-> t.setOffset(t.getOffset() + 403));
List<Note> footnotes = new TEIFormatter(null, null)
.makeNotes(tokens, text, Note.NoteType.FOOT, 37);

assertThat(footnotes, hasSize(5));
assertThat(footnotes.get(0).getLabel(), is("198"));
assertThat(footnotes.get(0).getTokens(), hasSize(greaterThan(0)));
assertThat(footnotes.get(1).getLabel(), is("199"));
assertThat(footnotes.get(1).getTokens(), hasSize(greaterThan(0)));
assertThat(footnotes.get(2).getLabel(), is("200"));
assertThat(footnotes.get(2).getTokens(), hasSize(greaterThan(0)));
assertThat(footnotes.get(3).getLabel(), is("201"));
assertThat(footnotes.get(3).getText(), is("Id. "));
assertThat(footnotes.get(3).getTokens(), hasSize(greaterThan(0)));
assertThat(footnotes.get(4).getLabel(), is("202"));
assertThat(footnotes.get(4).getTokens(), hasSize(greaterThan(0)));
}



Expand Down
Loading