Improve pdf content parser for DOIs (#11782)

* Improve pdf content parser for DOIs * changelog * try to fix checkstyle * reorder * fix * fuu checstyle * Modernize test java code Co-authored-by: Carl Christian Snethlage <50491877+calixtus@users.noreply.github.com> * Add missing dot Co-authored-by: Carl Christian Snethlage <50491877+calixtus@users.noreply.github.com> --------- Co-authored-by: Oliver Kopp <kopp.dev@gmail.com> Co-authored-by: Carl Christian Snethlage <50491877+calixtus@users.noreply.github.com>
JabRef · Sep 18, 2024 · 604896c · 604896c
1 parent 88fc846
commit 604896c
Show file tree

Hide file tree

Showing 3 changed files with 102 additions and 75 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -48,6 +48,7 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv
 - The browse button for a Custom exporter now opens in the directory of the current used exporter file. [#11717](https://github.com/JabRef/jabref/pull/11717)
 - We improved the display of long messages in the integrity check dialog. [#11619](https://github.com/JabRef/jabref/pull/11619)
 - We improved the undo/redo buttons in the main toolbar and main menu to be disabled when there is nothing to undo/redo. [#8807](https://github.com/JabRef/jabref/issues/8807)
+- We improved the DOI detection in PDF imports. [#11782](https://github.com/JabRef/jabref/pull/11782)
 
 ### Fixed
 

diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java
@@ -22,6 +22,7 @@
 import org.jabref.model.entry.BibEntry;
 import org.jabref.model.entry.LinkedFile;
 import org.jabref.model.entry.field.StandardField;
+import org.jabref.model.entry.identifier.DOI;
 import org.jabref.model.entry.types.EntryType;
 import org.jabref.model.entry.types.StandardEntryType;
 import org.jabref.model.strings.StringUtil;
@@ -241,7 +242,7 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
         String keywords = null;
         String title;
         String conference = null;
-        String DOI = null;
+        String doi = null;
         String series = null;
         String volume = null;
         String number = null;
@@ -253,6 +254,7 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
         if (curString.length() > 4) {
             // special case: possibly conference as first line on the page
             extractYear();
+            doi = getDoi(null);
             if (curString.contains("Conference")) {
                 fillCurStringWithNonEmptyLines();
                 conference = curString;
@@ -384,27 +386,7 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
                     }
                 }
             } else {
-                if (DOI == null) {
-                    pos = curString.indexOf("DOI");
-                    if (pos < 0) {
-                        pos = curString.indexOf(StandardField.DOI.getName());
-                    }
-                    if (pos >= 0) {
-                        pos += 3;
-                        if (curString.length() > pos) {
-                            char delimiter = curString.charAt(pos);
-                            if ((delimiter == ':') || (delimiter == ' ')) {
-                                pos++;
-                            }
-                            int nextSpace = curString.indexOf(' ', pos);
-                            if (nextSpace > 0) {
-                                DOI = curString.substring(pos, nextSpace);
-                            } else {
-                                DOI = curString.substring(pos);
-                            }
-                        }
-                    }
-                }
+                doi = getDoi(doi);
 
                 if ((publisher == null) && curString.contains("IEEE")) {
                     // IEEE has the conference things at the end
@@ -459,8 +441,8 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
         if (conference != null) {
             entry.setField(StandardField.BOOKTITLE, conference);
         }
-        if (DOI != null) {
-            entry.setField(StandardField.DOI, DOI);
+        if (doi != null) {
+            entry.setField(StandardField.DOI, doi);
         }
         if (series != null) {
             entry.setField(StandardField.SERIES, series);
@@ -483,6 +465,20 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
         return Optional.of(entry);
     }
 
+    private String getDoi(String doi) { // keeps a non-null doi as is; otherwise tries to extract a DOI from curString
+        int pos;
+        if (doi == null) { // only search while no DOI has been found so far
+            pos = curString.indexOf("DOI"); // case-sensitive marker check on the current line(s)
+            if (pos < 0) {
+                pos = curString.indexOf(StandardField.DOI.getName()); // NOTE(review): presumably the lower-case field name "doi"; other casings such as "Doi" would not match - confirm
+            }
+            if (pos >= 0) { // pos is used only as a "marker present" flag, not as an offset
+                return DOI.findInText(curString).map(DOI::getDOI).orElse(null); // null if the marker is present but no DOI could be extracted
+            }
+        }
+        return doi; // the value passed in (null when it was null and curString has no marker)
+    }
+
     private String getFirstPageContents(PDDocument document) throws IOException {
         PDFTextStripper stripper = new PDFTextStripper();
 

diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java
@@ -1,7 +1,6 @@
 package org.jabref.logic.importer.fileformat;
 
 import java.nio.file.Path;
-import java.util.Collections;
 import java.util.List;
 import java.util.Optional;
 
@@ -10,87 +9,118 @@
 import org.jabref.model.entry.field.StandardField;
 import org.jabref.model.entry.types.StandardEntryType;
 
-import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
 class PdfContentImporterTest {
 
-    private PdfContentImporter importer;
-
-    @BeforeEach
-    void setUp() {
-        importer = new PdfContentImporter();
-    }
+    private PdfContentImporter importer = new PdfContentImporter();
 
     @Test
     void doesNotHandleEncryptedPdfs() throws Exception {
         Path file = Path.of(PdfContentImporter.class.getResource("/pdfs/encrypted.pdf").toURI());
         List<BibEntry> result = importer.importDatabase(file).getDatabase().getEntries();
-        assertEquals(Collections.emptyList(), result);
+        assertEquals(List.of(), result);
     }
 
     @Test
     void importTwiceWorksAsExpected() throws Exception {
         Path file = Path.of(PdfContentImporter.class.getResource("/pdfs/minimal.pdf").toURI());
         List<BibEntry> result = importer.importDatabase(file).getDatabase().getEntries();
 
-        BibEntry expected = new BibEntry(StandardEntryType.InProceedings);
-        expected.setField(StandardField.AUTHOR, "1 ");
-        expected.setField(StandardField.TITLE, "Hello World");
-        expected.setFiles(Collections.singletonList(new LinkedFile("", file.toAbsolutePath(), "PDF")));
+        BibEntry expected = new BibEntry(StandardEntryType.InProceedings)
+                .withField(StandardField.AUTHOR, "1 ")
+                .withField(StandardField.TITLE, "Hello World")
+                .withFiles(List.of(new LinkedFile("", file.toAbsolutePath(), "PDF")));
+        assertEquals(List.of(expected), result);
 
         List<BibEntry> resultSecondImport = importer.importDatabase(file).getDatabase().getEntries();
-        assertEquals(Collections.singletonList(expected), result);
-        assertEquals(Collections.singletonList(expected), resultSecondImport);
+        assertEquals(List.of(expected), resultSecondImport);
     }
 
     @Test
     void parsingEditorWithoutPagesorSeriesInformation() {
-        BibEntry entry = new BibEntry(StandardEntryType.InProceedings);
-        entry.setField(StandardField.AUTHOR, "Anke Lüdeling and Merja Kytö (Eds.)");
-        entry.setField(StandardField.EDITOR, "Anke Lüdeling and Merja Kytö");
-        entry.setField(StandardField.PUBLISHER, "Springer");
-        entry.setField(StandardField.TITLE, "Corpus Linguistics – An International Handbook – Lüdeling, Anke, Kytö, Merja (Eds.)");
-
-        String firstPageContents = "Corpus Linguistics – An International Handbook – Lüdeling, Anke,\n" +
-                                   "Kytö, Merja (Eds.)\n" +
-                                   "\n" +
-                                   "Anke Lüdeling, Merja Kytö (Eds.)\n" +
-                                   "\n" +
-                                   "VOLUME 2\n" +
-                                   "\n" +
-                                   "This handbook provides an up-to-date survey of the field of corpus linguistics, a Handbücher zur Sprach- und\n" +
-                                   "field whose methodology has revolutionized much of the empirical work done in Kommunikationswissenschaft / Handbooks\n" +
-                                   "\n" +
-                                   "of Linguistics and Communication Science\n" +
-                                   "most fields of linguistic study over the past decade. (HSK) 29/2\n" +
-                                   "\n" +
-                                   "vii, 578 pages\n" +
-                                   "Corpus linguistics investigates human language by starting out from large\n";
+        BibEntry entry = new BibEntry(StandardEntryType.InProceedings)
+                .withField(StandardField.AUTHOR, "Anke Lüdeling and Merja Kytö (Eds.)")
+                .withField(StandardField.EDITOR, "Anke Lüdeling and Merja Kytö")
+                .withField(StandardField.PUBLISHER, "Springer")
+                .withField(StandardField.TITLE, "Corpus Linguistics – An International Handbook – Lüdeling, Anke, Kytö, Merja (Eds.)");
+
+        String firstPageContents = """
+                Corpus Linguistics – An International Handbook – Lüdeling, Anke,
+                Kytö, Merja (Eds.)
+
+                Anke Lüdeling, Merja Kytö (Eds.)
+
+                VOLUME 2
+
+                This handbook provides an up-to-date survey of the field of corpus linguistics, a Handbücher zur Sprach- und
+                field whose methodology has revolutionized much of the empirical work done in Kommunikationswissenschaft / Handbooks
+
+                of Linguistics and Communication Science
+                most fields of linguistic study over the past decade. (HSK) 29/2
+
+                vii, 578 pages
+                Corpus linguistics investigates human language by starting out from large
+                """;
 
         assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContents, "\n"));
     }
 
     @Test
     void parsingWithoutActualDOINumber() {
-        BibEntry entry = new BibEntry(StandardEntryType.InProceedings);
-        entry.withField(StandardField.AUTHOR, "Link to record in KAR and http://kar.kent.ac.uk/51043/  and Document Version and UNSPECIFIED  and Master of Research (MRes) thesis and University of Kent")
-             .withField(StandardField.TITLE, "Kent Academic Repository Full text document (pdf) Citation for published version Smith, Lucy Anna (2014) Mortality in the Ornamental Fish Retail Sector: an Analysis of Stock Losses and Stakeholder Opinions. DOI")
-             .withField(StandardField.YEAR, "5104");
-
-        String firstPageContents = "Kent Academic Repository Full text document (pdf)\n"
-                                   + "Citation for published version\n"
-                                   + "Smith, Lucy Anna (2014) Mortality in the Ornamental Fish Retail Sector: an Analysis of Stock\n"
-                                   + "Losses and Stakeholder Opinions.\n"
-                                   + "DOI\n\n\n"
-                                   + "Link to record in KAR\n"
-                                   + "http://kar.kent.ac.uk/51043/\n"
-                                   + "Document Version\n"
-                                   + "UNSPECIFIED\n"
-                                   + "Master of Research (MRes) thesis, University of Kent,.";
+        BibEntry entry = new BibEntry(StandardEntryType.InProceedings)
+                .withField(StandardField.AUTHOR, "Link to record in KAR and http://kar.kent.ac.uk/51043/  and Document Version and UNSPECIFIED  and Master of Research (MRes) thesis and University of Kent")
+                .withField(StandardField.TITLE, "Kent Academic Repository Full text document (pdf) Citation for published version Smith, Lucy Anna (2014) Mortality in the Ornamental Fish Retail Sector: an Analysis of Stock Losses and Stakeholder Opinions. DOI")
+                .withField(StandardField.YEAR, "5104");
+
+        String firstPageContents = """
+                Kent Academic Repository Full text document (pdf)
+                Citation for published version
+                Smith, Lucy Anna (2014) Mortality in the Ornamental Fish Retail Sector: an Analysis of Stock
+                Losses and Stakeholder Opinions.
+                DOI
+
+                Link to record in KAR
+                http://kar.kent.ac.uk/51043/
+                Document Version
+                UNSPECIFIED
+                Master of Research (MRes) thesis, University of Kent,.""";
 
         assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContents, "\n"));
     }
+
+    @Test
+    void extractDOIFromPage1() { // fixture mentions the DOI twice: as "doi: ..." in its first line and as a https://doi.org/ URL in its last line
+        BibEntry entry = new BibEntry(StandardEntryType.InProceedings)
+                .withField(StandardField.DOI, "10.1017/S0007114507795296") // expected: the bare identifier, without the "doi:" prefix or URL part
+                .withField(StandardField.AUTHOR, "Review Article")
+                .withField(StandardField.TITLE, "British Journal of Nutrition (2008), 99, 1–11 doi: 10.1017/S0007114507795296 q The Authors") // expected title still contains the raw "doi: ..." text
+                .withField(StandardField.YEAR, "2008");
+
+        String firstPageContent = """
+                British Journal of Nutrition (2008), 99, 1–11 doi: 10.1017/S0007114507795296
+                q The Authors 2008
+
+                Review Article
+
+                Cocoa and health: a decade of research
+
+                Karen A. Cooper1, Jennifer L. Donovan2, Andrew L. Waterhouse3 and Gary Williamson1*
+                1Nestlé Research Center, Vers-Chez-les-Blanc, PO Box 44, CH-1000 Lausanne 26, Switzerland
+                2Department of Psychiatry and Behavioural Sciences, Medical University of South Carolina, Charleston, SC 29425, USA
+                3Department of Viticulture & Enology, University of California, Davis, CA 95616, USA
+
+                (Received 5 December 2006 – Revised 29 May 2007 – Accepted 31 May 2007)
+
+                Abbreviations: FMD, flow-mediated dilation; NO, nitirc oxide.
+
+                *Corresponding author: Dr Gary Williamson, fax þ41 21 785 8544, email gary.williamson@rdls.nestle.com
+
+                British Journal of Nutrition
+                https://doi.org/10.1017/S0007114507795296 Published online by Cambridge University Press""";
+
+        assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContent, "\n")); // second argument: presumably the line separator used by the fixture - confirm against the method signature
+    }
 }