Skip to content

Commit

Permalink
Merge pull request karussell#35 from skyshard/abhishek/nytimes_excessive_content_extraction
Browse files
Browse the repository at this point in the history

Nytimes.com - Fixed content extraction
  • Loading branch information
andresp99999 authored Jun 5, 2017
2 parents b49ba8e + bc18003 commit 37b5d15
Show file tree
Hide file tree
Showing 3 changed files with 4 additions and 0 deletions.
3 changes: 3 additions & 0 deletions src/main/java/de/jetwick/snacktory/ArticleTextExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,9 @@ public boolean hasHTMLTags(String text){
aMap.put("inforisktoday", Arrays.asList(
"p:has(b):contains(See Also:)"
));
aMap.put("nytimes.com", Arrays.asList(
"[class*=hidden]"
));

NODES_TO_REMOVE_PER_DOMAIN = Collections.unmodifiableMap(aMap);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,7 @@ public void testNytContentExtraction6() throws Exception {
assertEquals("Robert Mueller, Former F.B.I. Director, Is Named Special Counsel for Russia Investigation", res.getTitle());
assertTrue(res.getText(), res.getText().startsWith("WASHINGTON — The Justice Department appointed Robert S. Mueller III, a former F.B.I. director,"));
assertFalse(res.getText(), res.getText().contains("Please verify you're not a robot by clicking the box. Invalid email address."));
assertFalse(res.getText(), res.getText().contains("View all New York Times newsletters. See Sample Manage Email Preferences Not you? Privacy Policy"));
assertTrue(res.getText(), res.getText().endsWith("He’s the embodiment of integrity.”"));
assertEquals("Rebecca R. Ruiz and Mark Landler", res.getAuthorName());
assertEquals("https://www.nytimes.com/by/rebecca-r-ruiz", res.getAuthorDescription());
Expand Down
Empty file added: tmp
Empty file.

0 comments on commit 37b5d15

Please sign in to comment.