Skip to content

Commit

Permalink
Fixed extraction issues
Browse files Browse the repository at this point in the history
CRAW-170:
http://www.morningstar.com/news/associated-press/urn:publicid:ap.org:f8d53c4370434744a864d4afa5fa8d36/hackers-break-into-centralized-password-manager-onelogin.html
- Wrong content was extracted
- Added a CSS selector rule in setHighlyPositive

CRAW-163:
http://www.sfchronicle.com/business/article/Odd-jobs-matchmaker-Thumbtack-gets-big-funds-6541824.php
- The link http://thumbtack.com/ was missing from the links section of the extracted contents
- The paragraph containing http://thumbtack.com/ was missing from the extracted contents.
- Added a CSS selector rule in BEST_ELEMENT_PER_DOMAIN
- Link is now extracted
      {
        "offset": "5013",
        "text": "Thumbtack",
        "url": "http://thumbtack.com"
      },
  • Loading branch information
Abhishek Mulay committed Jun 5, 2017
1 parent b49ba8e commit d93b115
Show file tree
Hide file tree
Showing 4 changed files with 2,675 additions and 1 deletion.
5 changes: 4 additions & 1 deletion src/main/java/de/jetwick/snacktory/ArticleTextExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,9 @@ public boolean hasHTMLTags(String text){
aMap.put("sltrib.com", Arrays.asList(
"#main-content > div.row"
));
aMap.put("sfchronicle.com", Arrays.asList(
"div[class=article-text]"
));

BEST_ELEMENT_PER_DOMAIN = Collections.unmodifiableMap(aMap);
}
Expand Down Expand Up @@ -290,7 +293,7 @@ public ArticleTextExtractor() {
+ "login|si(debar|gn|ngle)");
setPositive("(^(body|content|h?entry|main|page|post|text|blog|story|haupt))"
+ "|arti(cle|kel)|instapaper_body|storybody|short-story|storycontent|articletext|story-primary|^newsContent$|dcontainer|announcement-details");
setHighlyPositive("news-release-detail|storybody|main-content|articlebody|article_body|article-body|html-view-content|entry__body|^main-article$|^article__content$|^articleContent$|^mainEntityOfPage$|art_body_article|^article_text$|main-article-chapter|post-body");
setHighlyPositive("news-detail-content|news-release-detail|storybody|main-content|articlebody|article_body|article-body|html-view-content|entry__body|^main-article$|^article__content$|^articleContent$|^mainEntityOfPage$|art_body_article|^article_text$|main-article-chapter|post-body");
setNegative("nav($|igation)|user|com(ment|bx)|(^com-)|contact|"
+ "foot|masthead|(me(dia|ta))|outbrain|promo|related|scroll|(sho(utbox|pping))|"
+ "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard|title|truncate|slider|^sectioncolumns$|ad-container");
Expand Down
30 changes: 30 additions & 0 deletions src/test/java/de/jetwick/snacktory/ArticleTextExtractorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -3079,6 +3079,36 @@ public void testPublicNet() throws Exception {
compareDates("2017-05-12 00:00:00", res.getDate());
}

/**
 * Regression test for CRAW-170: extracts the stored Morningstar/AP article
 * fixture ({@code morningstar.html}) and checks the canonical URL, title,
 * the start and end of the body text, the (empty) author fields and the date.
 */
@Test
public void testMorningStar() throws Exception {
    // The page URL is both fed to the extractor and expected back as the canonical URL.
    final String articleUrl = "http://www.morningstar.com/news/associated-press/urn:publicid:ap.org:f8d53c4370434744a864d4afa5fa8d36/hackers-break-into-centralized-password-manager-onelogin.html";

    JResult res = new JResult();
    res.setUrl(articleUrl);
    res = extractor.extractContent(
            res,
            c.streamToString(getClass().getResourceAsStream("morningstar.html")));

    assertEquals(articleUrl, res.getCanonicalUrl());
    assertEquals("Hackers break into centralized password manager OneLogin", res.getTitle());

    // The extracted text doubles as the failure message so a mismatch shows what was extracted.
    final String text = res.getText();
    assertTrue(text, text.startsWith("NEW YORK (AP) — Hackers have gained access to OneLogin,"));
    assertTrue(text, text.endsWith("although not actual passwords."));

    // This fixture is expected to yield no author information.
    assertEquals(StringUtils.EMPTY, res.getAuthorName());
    assertEquals(StringUtils.EMPTY, res.getAuthorDescription());
    compareDates("2017-06-02 00:00:00", res.getDate());
}

/**
 * Regression test for CRAW-163: extracts the stored San Francisco Chronicle
 * article fixture ({@code sfchronicle.html}) and checks the canonical URL,
 * title, the start and end of the body text, the author fields and the date.
 */
@Test
public void testSfchronicle() throws Exception {
    // The page URL is both fed to the extractor and expected back as the canonical URL.
    final String articleUrl = "http://www.sfchronicle.com/business/article/Odd-jobs-matchmaker-Thumbtack-gets-big-funds-6541824.php";
    // The author blurb is expected both at the end of the text and as the author description.
    final String authorBlurb = "Carolyn Said is a San Francisco Chronicle staff writer. E-mail: csaid@sfchronicle.com Twitter: @csaid";

    JResult res = new JResult();
    res.setUrl(articleUrl);
    res = extractor.extractContent(
            res,
            c.streamToString(getClass().getResourceAsStream("sfchronicle.html")));

    assertEquals(articleUrl, res.getCanonicalUrl());
    assertEquals("Odd-jobs matchmaker Thumbtack gets big funds, joins unicorn club - San Francisco Chronicle", res.getTitle());

    // The extracted text doubles as the failure message so a mismatch shows what was extracted.
    final String text = res.getText();
    assertTrue(text, text.startsWith("San Francisco’s Thumbtack, which introduces fix-it folks,"));
    assertTrue(text, text.endsWith(authorBlurb));

    assertEquals("Carolyn Said", res.getAuthorName());
    assertEquals(authorBlurb, res.getAuthorDescription());
    compareDates("2015-09-30 00:00:00", res.getDate());
}

public static void compareDates(String expectedDateString, Date actual) {
String[] patterns = {
"yyyy-MM-dd",
Expand Down
Loading

0 comments on commit d93b115

Please sign in to comment.