Skip to content

Commit

Permalink
Improve DoiResolution fetcher
Browse files Browse the repository at this point in the history
  • Loading branch information
stefan-kolb committed Mar 15, 2020
1 parent 086629d commit 15c7981
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 55 deletions.
130 changes: 79 additions & 51 deletions src/main/java/org/jabref/logic/importer/fetcher/DoiResolution.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
import java.util.Locale;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;

import org.jabref.logic.importer.FulltextFetcher;
import org.jabref.logic.net.URLDownload;
import org.jabref.logic.util.strings.StringSimilarity;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.entry.identifier.DOI;
Expand All @@ -27,68 +29,94 @@
* FulltextFetcher implementation that follows the DOI resolution redirects and scans for a full-text PDF URL.
*/
public class DoiResolution implements FulltextFetcher {

private static final Logger LOGGER = LoggerFactory.getLogger(DoiResolution.class);

@Override
public Optional<URL> findFullText(BibEntry entry) throws IOException {
Objects.requireNonNull(entry);
Optional<URL> pdfLink = Optional.empty();

Optional<DOI> doi = entry.getField(StandardField.DOI).flatMap(DOI::parse);

if (doi.isPresent()) {
String sciLink = doi.get().getURIAsASCIIString();

// follow all redirects and scan for a single pdf link
if (!sciLink.isEmpty()) {
try {
Connection connection = Jsoup.connect(sciLink);
// pretend to be a browser (agent & referrer)
connection.userAgent(URLDownload.USER_AGENT);
connection.referrer("http://www.google.com");
connection.followRedirects(true);
connection.ignoreHttpErrors(true);
// some publishers are quite slow (default is 3s)
connection.timeout(10000);

Document html = connection.get();

// scan for PDF
Elements elements = html.body().select("a[href]");
List<Optional<URL>> links = new ArrayList<>();

for (Element element : elements) {
String href = element.attr("abs:href").toLowerCase(Locale.ENGLISH);
String hrefText = element.text().toLowerCase(Locale.ENGLISH);
// Only check if pdf is included in the link or inside the text
// ACM uses tokens without PDF inside the link
// See https://github.com/lehner/LocalCopy for more scrape ideas
if (element.attr("title").toLowerCase(Locale.ENGLISH).contains("pdf") && new URLDownload(href).isPdf()) {
return Optional.of(new URL(href));
}

if (href.contains("pdf") || hrefText.contains("pdf") && new URLDownload(href).isPdf()) {
links.add(Optional.of(new URL(href)));
}
}
// return if only one link was found (high accuracy)
if (links.size() == 1) {
LOGGER.info("Fulltext PDF found @ " + sciLink);
pdfLink = links.get(0);
}
} catch (UnsupportedMimeTypeException type) {
// this might be the PDF already as we follow redirects
if (type.getMimeType().startsWith("application/pdf")) {
return Optional.of(new URL(type.getUrl()));
}
LOGGER.warn("DoiResolution fetcher failed: ", type);
} catch (IOException e) {
LOGGER.warn("DoiResolution fetcher failed: ", e);
if (!doi.isPresent()) {
return Optional.empty();
}

String doiLink = doi.get().getURIAsASCIIString();
if (doiLink.isEmpty()) {
return Optional.empty();
}

// follow all redirects and scan for a single pdf link
try {
Connection connection = Jsoup.connect(doiLink);
// pretend to be a browser (agent & referrer)
connection.userAgent(URLDownload.USER_AGENT);
connection.referrer("http://www.google.com");
connection.followRedirects(true);
connection.ignoreHttpErrors(true);
// some publishers are quite slow (default is 3s)
connection.timeout(10000);

Document html = connection.get();
// scan for PDF
Elements hrefElements = html.body().select("a[href]");

List<URL> links = new ArrayList<>();
for (Element element : hrefElements) {
String href = element.attr("abs:href").toLowerCase(Locale.ENGLISH);
String hrefText = element.text().toLowerCase(Locale.ENGLISH);
// Only check if pdf is included in the link or inside the text
// ACM uses tokens without PDF inside the link
// See https://github.com/lehner/LocalCopy for more scrape ideas
// link with "PDF" in title tag
if (element.attr("title").toLowerCase(Locale.ENGLISH).contains("pdf") && new URLDownload(href).isPdf()) {
return Optional.of(new URL(href));
}

if (href.contains("pdf") || hrefText.contains("pdf") && new URLDownload(href).isPdf()) {
links.add(new URL(href));
}
}

// return if only one link was found (high accuracy)
if (links.size() == 1) {
LOGGER.info("Fulltext PDF found @ " + doiLink);
return Optional.of(links.get(0));
}
// return if links are similar or multiple links are similar
return findSimilarLinks(links);
} catch (UnsupportedMimeTypeException type) {
// this might be the PDF already as we follow redirects
if (type.getMimeType().startsWith("application/pdf")) {
return Optional.of(new URL(type.getUrl()));
}
LOGGER.warn("DoiResolution fetcher failed: ", type);
} catch (IOException e) {
LOGGER.warn("DoiResolution fetcher failed: ", e);
}

return Optional.empty();
}

/**
 * Picks a candidate URL when the collected links effectively agree on one target.
 *
 * @param urls all PDF-candidate links harvested from the landing page
 * @return the single distinct link, or the first link when every distinct link is
 *         string-similar to it; empty when the candidates disagree or none exist
 */
private Optional<URL> findSimilarLinks(List<URL> urls) {
    List<URL> distinctLinks = urls.stream().distinct().collect(Collectors.toList());

    if (distinctLinks.isEmpty()) {
        return Optional.empty();
    }
    // all candidates are equal
    if (distinctLinks.size() == 1) {
        return Optional.of(distinctLinks.get(0));
    }
    // all candidates are similar to the first one -> trust the first
    // (note: stray leftover statement `return pdfLink;` removed — it referenced an
    // undefined variable and made the similarity check unreachable)
    final String firstElement = distinctLinks.get(0).toString();
    StringSimilarity similarity = new StringSimilarity();
    List<URL> similarLinks = distinctLinks.stream()
            .filter(elem -> similarity.isSimilar(firstElement, elem.toString()))
            .collect(Collectors.toList());
    if (similarLinks.size() == distinctLinks.size()) {
        return Optional.of(similarLinks.get(0));
    }

    return Optional.empty();
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,7 @@ void setUp() {
}

@Test
@DisabledOnCIServer("CI server is blocked")
void findByDOI() throws IOException {
void linkWithPdfInTitleTag() throws IOException {
entry.setField(StandardField.DOI, "10.1051/0004-6361/201527330");

assertEquals(
Expand All @@ -37,17 +36,38 @@ void findByDOI() throws IOException {
);
}

@Test
void linkWithPdfStringLeadsToFulltext() throws IOException {
    // Landing page for this DOI carries an anchor with "pdf" in the URL itself.
    entry.setField(StandardField.DOI, "10.1002/acr2.11101");
    URL expectedPdf = new URL("https://onlinelibrary.wiley.com/doi/epdf/10.1002/acr2.11101");
    assertEquals(Optional.of(expectedPdf), finder.findFullText(entry));
}

@Test
void multipleLinksWithSmallEditDistanceLeadToFulltext() throws IOException {
    // Several nearly-identical candidate links should collapse to one full-text URL.
    entry.setField(StandardField.DOI, "10.1002/acr2.11101");
    URL expectedPdf = new URL("https://onlinelibrary.wiley.com/doi/epdf/10.1002/acr2.11101");
    assertEquals(Optional.of(expectedPdf), finder.findFullText(entry));
}

@Test
void notReturnAnythingWhenMultipleLinksAreFound() throws IOException {
    // This DOI's landing page offers multiple dissimilar PDF links; the fetcher
    // must not guess and has to return an empty result.
    // FIX: removed a duplicated, dead setField(...) call left over from an
    // earlier revision — only the intended DOI is set.
    entry.setField(StandardField.DOI, "10.1109/JXCDC.2019.2911135");
    assertEquals(Optional.empty(), finder.findFullText(entry));
}

@Test
@DisabledOnCIServer("CI server is blocked")
void notFoundByDOI() throws IOException {
    // A DOI that does not resolve to anything must yield no full-text link.
    entry.setField(StandardField.DOI, "10.1186/unknown-doi");
    Optional<URL> result = finder.findFullText(entry);
    assertEquals(Optional.empty(), result);
}

@Test
void entityWithoutDoi() throws IOException {
    // Without a DOI field there is nothing to resolve.
    Optional<URL> result = finder.findFullText(entry);
    assertEquals(Optional.empty(), result);
}

@Test
void trustLevel() {
    // The fetcher follows the publisher's own DOI redirect, hence SOURCE-level trust.
    TrustLevel level = finder.getTrustLevel();
    assertEquals(TrustLevel.SOURCE, level);
}
}

0 comments on commit 15c7981

Please sign in to comment.