Skip to content

Commit

Permalink
Merge pull request #6126 from JabRef/google-captcha
Browse files (browse the repository at this point in the history)
Minor fetcher improvements
  • Loading branch information
koppor authored Mar 15, 2020
2 parents a71959f + 0f93e22 commit 0628f11
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 23 deletions.
25 changes: 13 additions & 12 deletions src/main/java/org/jabref/logic/importer/fetcher/ACS.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ public class ACS implements FulltextFetcher {

/**
* Tries to find a fulltext URL for a given BibTex entry.
*
* <p>
* Currently only uses the DOI if found.
*
* @param entry The Bibtex entry
Expand All @@ -37,23 +37,24 @@ public class ACS implements FulltextFetcher {
@Override
public Optional<URL> findFullText(BibEntry entry) throws IOException {
Objects.requireNonNull(entry);
Optional<URL> pdfLink = Optional.empty();

// DOI search
Optional<DOI> doi = entry.getField(StandardField.DOI).flatMap(DOI::parse);

if (doi.isPresent()) {
String source = String.format(SOURCE, doi.get().getDOI());
// Retrieve PDF link
Document html = Jsoup.connect(source).ignoreHttpErrors(true).get();
Element link = html.select("a.button_primary").first();
if (!doi.isPresent()) {
return Optional.empty();
}

String source = String.format(SOURCE, doi.get().getDOI());
// Retrieve PDF link
Document html = Jsoup.connect(source).ignoreHttpErrors(true).get();
Element link = html.select("a.button_primary").first();

if (link != null) {
LOGGER.info("Fulltext PDF found @ ACS.");
pdfLink = Optional.of(new URL(source.replaceFirst("/abs/", "/pdf/")));
}
if (link != null) {
LOGGER.info("Fulltext PDF found @ ACS.");
return Optional.of(new URL(source.replaceFirst("/abs/", "/pdf/")));
}
return pdfLink;
return Optional.empty();
}

@Override
Expand Down
4 changes: 1 addition & 3 deletions src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,8 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException {
.filter(Optional::isPresent)
.map(Optional::get)
.findFirst();
pdfUrl.ifPresent(url -> LOGGER.info("Fulltext PDF found @ arXiv."));

if (pdfUrl.isPresent()) {
LOGGER.info("Fulltext PDF found @ arXiv.");
}
return pdfUrl;
} catch (FetcherException e) {
LOGGER.warn("arXiv API request failed", e);
Expand Down
23 changes: 17 additions & 6 deletions src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@

/**
* FulltextFetcher implementation that attempts to find a PDF URL at GoogleScholar.
*
* <p>
* Information on search strings: https://scholar.google.com/intl/en/scholar/help.html#searching
*/
public class GoogleScholar implements FulltextFetcher, SearchBasedFetcher {
Expand All @@ -58,11 +58,10 @@ public GoogleScholar(ImportFormatPreferences importFormatPreferences) {
@Override
public Optional<URL> findFullText(BibEntry entry) throws IOException, FetcherException {
Objects.requireNonNull(entry);
Optional<URL> pdfLink = Optional.empty();

// Search in title
if (!entry.hasField(StandardField.TITLE)) {
return pdfLink;
return Optional.empty();
}

try {
Expand All @@ -74,12 +73,10 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException, FetcherExc
// as_occt field to search in
uriBuilder.addParameter("as_occt", "title");

pdfLink = search(uriBuilder.toString());
return search(uriBuilder.toString());
} catch (URISyntaxException e) {
throw new FetcherException("Building URI failed.", e);
}

return pdfLink;
}

@Override
Expand All @@ -91,6 +88,11 @@ private Optional<URL> search(String url) throws IOException {
Optional<URL> pdfLink = Optional.empty();

Document doc = Jsoup.connect(url).userAgent(URLDownload.USER_AGENT).get();

if (needsCaptcha(doc.body().html())) {
LOGGER.warn("Hit Google traffic limitation. Captcha prevents automatic fetching.");
return Optional.empty();
}
// Check results for PDF link
// TODO: link always on first result or none?
for (int i = 0; i < NUM_RESULTS; i++) {
Expand All @@ -111,6 +113,10 @@ private Optional<URL> search(String url) throws IOException {
return pdfLink;
}

/**
 * Tells whether the given page markup contains the captcha marker
 * {@code id="gs_captcha_ccl"}.
 *
 * @param body the markup to inspect
 * @return {@code true} if the marker occurs anywhere in {@code body}
 */
private boolean needsCaptcha(String body) {
    final String captchaMarker = "id=\"gs_captcha_ccl\"";
    return body.indexOf(captchaMarker) >= 0;
}

@Override
public String getName() {
return "Google Scholar";
Expand Down Expand Up @@ -158,6 +164,11 @@ public List<BibEntry> performSearch(String query) throws FetcherException {
private void addHitsFromQuery(List<BibEntry> entryList, String queryURL) throws IOException, FetcherException {
String content = new URLDownload(queryURL).asString();

if (needsCaptcha(content)) {
throw new FetcherException("Fetching from Google Scholar failed.",
Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), null);
}

Matcher matcher = LINK_TO_BIB_PATTERN.matcher(content);
while (matcher.find()) {
String citationsPageURL = matcher.group().replace("&amp;", "&");
Expand Down
11 changes: 10 additions & 1 deletion src/test/java/org/jabref/logic/importer/fetcher/ACSTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

@FetcherTest
class ACSTest {

private ACS finder;
private BibEntry entry;

Expand Down Expand Up @@ -44,4 +43,14 @@ void notFoundByDOI() throws IOException {

assertEquals(Optional.empty(), finder.findFullText(entry));
}

@Test
void entityWithoutDoi() throws IOException {
    // NOTE(review): relies on the shared fixture entry carrying no DOI at this
    // point; the setup method is not visible here, so confirm against it.
    Optional<?> fullText = finder.findFullText(entry);

    assertEquals(Optional.empty(), fullText);
}

@Test
void trustLevel() {
    // The ACS fetcher is expected to report the PUBLISHER trust level.
    TrustLevel reported = finder.getTrustLevel();

    assertEquals(TrustLevel.PUBLISHER, reported);
}
}
11 changes: 10 additions & 1 deletion src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@

@FetcherTest
class ArXivTest {

private ArXiv finder;
private BibEntry entry;
private BibEntry sliceTheoremPaper;
Expand Down Expand Up @@ -121,6 +120,16 @@ void findFullTextByDOINotAvailableInCatalog() throws IOException {
assertEquals(Optional.empty(), finder.findFullText(entry));
}

@Test
void findFullTextEntityWithoutDoi() throws IOException {
    // NOTE(review): presumably the fixture entry has no DOI (nor any other
    // identifying field) set here; verify against the setup method.
    Optional<?> fullText = finder.findFullText(entry);

    assertEquals(Optional.empty(), fullText);
}

@Test
void findFullTextTrustLevel() {
    // The arXiv fetcher is expected to report the PREPRINT trust level.
    TrustLevel reported = finder.getTrustLevel();

    assertEquals(TrustLevel.PREPRINT, reported);
}

@Test
void searchEntryByPartOfTitle() throws Exception {
assertEquals(Collections.singletonList(sliceTheoremPaper),
Expand Down

0 comments on commit 0628f11

Please sign in to comment.