diff --git a/src/main/java/de/jetwick/snacktory/ArticleTextExtractor.java b/src/main/java/de/jetwick/snacktory/ArticleTextExtractor.java index e269cc01..05ddb052 100644 --- a/src/main/java/de/jetwick/snacktory/ArticleTextExtractor.java +++ b/src/main/java/de/jetwick/snacktory/ArticleTextExtractor.java @@ -243,6 +243,9 @@ public boolean hasHTMLTags(String text){ aMap.put("sltrib.com", Arrays.asList( "#main-content > div.row" )); + aMap.put("sfchronicle.com", Arrays.asList( + "div[class=article-text]" + )); BEST_ELEMENT_PER_DOMAIN = Collections.unmodifiableMap(aMap); } @@ -290,7 +293,7 @@ public ArticleTextExtractor() { + "login|si(debar|gn|ngle)"); setPositive("(^(body|content|h?entry|main|page|post|text|blog|story|haupt))" + "|arti(cle|kel)|instapaper_body|storybody|short-story|storycontent|articletext|story-primary|^newsContent$|dcontainer|announcement-details"); - setHighlyPositive("news-release-detail|storybody|main-content|articlebody|article_body|article-body|html-view-content|entry__body|^main-article$|^article__content$|^articleContent$|^mainEntityOfPage$|art_body_article|^article_text$|main-article-chapter|post-body"); + setHighlyPositive("news-detail-content|news-release-detail|storybody|main-content|articlebody|article_body|article-body|html-view-content|entry__body|^main-article$|^article__content$|^articleContent$|^mainEntityOfPage$|art_body_article|^article_text$|main-article-chapter|post-body"); setNegative("nav($|igation)|user|com(ment|bx)|(^com-)|contact|" + "foot|masthead|(me(dia|ta))|outbrain|promo|related|scroll|(sho(utbox|pping))|" + "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard|title|truncate|slider|^sectioncolumns$|ad-container"); diff --git a/src/test/java/de/jetwick/snacktory/ArticleTextExtractorTest.java b/src/test/java/de/jetwick/snacktory/ArticleTextExtractorTest.java index fe04ab33..e830f87c 100644 --- a/src/test/java/de/jetwick/snacktory/ArticleTextExtractorTest.java +++ b/src/test/java/de/jetwick/snacktory/ArticleTextExtractorTest.java @@ -3079,6 +3079,36 @@ public void testPublicNet() throws Exception { compareDates("2017-05-12 00:00:00", res.getDate()); } + @Test + public void testMorningStar() throws Exception { + // http://www.morningstar.com/news/associated-press/urn:publicid:ap.org:f8d53c4370434744a864d4afa5fa8d36/hackers-break-into-centralized-password-manager-onelogin.html + JResult res = new JResult(); + res.setUrl("http://www.morningstar.com/news/associated-press/urn:publicid:ap.org:f8d53c4370434744a864d4afa5fa8d36/hackers-break-into-centralized-password-manager-onelogin.html"); + res = extractor.extractContent(res, c.streamToString(getClass().getResourceAsStream("morningstar.html"))); + assertEquals("http://www.morningstar.com/news/associated-press/urn:publicid:ap.org:f8d53c4370434744a864d4afa5fa8d36/hackers-break-into-centralized-password-manager-onelogin.html", res.getCanonicalUrl()); + assertEquals("Hackers break into centralized password manager OneLogin", res.getTitle()); + assertTrue(res.getText(), res.getText().startsWith("NEW YORK (AP) — Hackers have gained access to OneLogin,")); + assertTrue(res.getText(), res.getText().endsWith("although not actual passwords.")); + assertEquals(StringUtils.EMPTY, res.getAuthorName()); + assertEquals(StringUtils.EMPTY, res.getAuthorDescription()); + compareDates("2017-06-02 00:00:00", res.getDate()); + } + + @Test + public void testSfchronicle() throws Exception { + // http://www.sfchronicle.com/business/article/Odd-jobs-matchmaker-Thumbtack-gets-big-funds-6541824.php + JResult res = new JResult(); + res.setUrl("http://www.sfchronicle.com/business/article/Odd-jobs-matchmaker-Thumbtack-gets-big-funds-6541824.php"); + res = extractor.extractContent(res, c.streamToString(getClass().getResourceAsStream("sfchronicle.html"))); + assertEquals("http://www.sfchronicle.com/business/article/Odd-jobs-matchmaker-Thumbtack-gets-big-funds-6541824.php", res.getCanonicalUrl()); + assertEquals("Odd-jobs matchmaker Thumbtack gets big funds, joins unicorn club - San Francisco Chronicle", res.getTitle()); + assertTrue(res.getText(), res.getText().startsWith("San Francisco’s Thumbtack, which introduces fix-it folks,")); + assertTrue(res.getText(), res.getText().endsWith("Carolyn Said is a San Francisco Chronicle staff writer. E-mail: csaid@sfchronicle.com Twitter: @csaid")); + assertEquals("Carolyn Said", res.getAuthorName()); + assertEquals("Carolyn Said is a San Francisco Chronicle staff writer. E-mail: csaid@sfchronicle.com Twitter: @csaid", res.getAuthorDescription()); + compareDates("2015-09-30 00:00:00", res.getDate()); + } + public static void compareDates(String expectedDateString, Date actual) { String[] patterns = { "yyyy-MM-dd", diff --git a/src/test/resources/de/jetwick/snacktory/morningstar.html b/src/test/resources/de/jetwick/snacktory/morningstar.html new file mode 100644 index 00000000..26ef1183 --- /dev/null +++ b/src/test/resources/de/jetwick/snacktory/morningstar.html @@ -0,0 +1,995 @@ + + + + + + Hackers break into centralized password manager OneLogin + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+
+
+ + + +
+
+ +
+
+
+ + +
+ +
+ + + + +

Hackers break into centralized password manager OneLogin

+ +
+ + +
+
+
+ + + +
+ +
+
+
+ +
+ +
+ + + +
+ 06/02/17 12:02 PM EDT +
+ + +
+ +

NEW YORK (AP) — Hackers have gained access to OneLogin, an online password + manager that offers a single sign-on to multiple websites and services.

+ +
+
+ + +
+
+
+
+ +
+ + +

OneLogin disclosed the hack in a blog post but didn't specify the data + accessed in the breach.

+

Published reports, however, say OneLogin informed customers that the hackers + appeared to have gotten a way to access encrypted data. Passwords are + typically stored that way.

+

OneLogin didn't immediately respond to an inquiry.

+

Password managers help people keep track of passwords for a growing array of + websites and services that require one. Instead of having to remember + complex passwords for each one, people can just remember a master password. + The password service then unlocks other accounts as needed.

+

In 2015, rival LastPass said hackers obtained some user information — + although not actual passwords.

+
+
+
+
+
+
+
+ + + +
+ +
+
+
+ +
+ +
+ +
+
+
+ + + +
+
+ + +
+
+
+
+
+ + + +
+
+ + +
+
+
+
+ + + +
+ +
+
+
+ + +
+
+ + + +
+ + + + + + + + + + + \ No newline at end of file diff --git a/src/test/resources/de/jetwick/snacktory/sfchronicle.html b/src/test/resources/de/jetwick/snacktory/sfchronicle.html new file mode 100644 index 00000000..4f086a9b --- /dev/null +++ b/src/test/resources/de/jetwick/snacktory/sfchronicle.html @@ -0,0 +1,1646 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Odd-jobs matchmaker Thumbtack gets big funds, joins unicorn club - San Francisco Chronicle + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ +
+ + + + + + +
+
+
+
+
+
+
+ +
+

Odd-jobs matchmaker Thumbtack gets big funds, joins unicorn club

+ + + + + September 30, 2015 + Updated: October 1, 2015 4:18pm + +
+ + +
+ + + + +
+ + +
+ + +
Geraldine Orozco, founder and creative director of Bay Area Candy Buffet, arranges flowers during a "dress rehearsal" to prepare to cater for a client's wedding on Wednesday, pictured Tuesday, April 21, 2015, at her home in Union City, Calif. Bay Area Candy Buffet caters sweets to clients. Orozco uses Thumbtack.com, which connects consumers to home professionals. Photo: Santiago Mejia, The Chronicle
+ + Photo: Santiago Mejia, The Chronicle + + + + + +
+
+ Geraldine Orozco, founder and creative director of Bay Area Candy Buffet, arranges flowers during a "dress rehearsal" to prepare to cater for a client's wedding on Wednesday, pictured Tuesday, April 21, 2015, at her home in Union City, Calif. Bay Area Candy Buffet caters sweets to clients. Orozco uses Thumbtack.com, which connects consumers to home professionals. + +
+ +
+ + + +

San Francisco’s Thumbtack, which introduces fix-it folks, caterers, yoga teachers and other professionals to people who need them, vaulted into the unicorn club this week after a $125 million funding round put its valuation at $1.3 billion.

+

“Our vision is to replace and reimagine a big chunk of the Yellow Pages,” said CEO Marco Zappacosta, who co-founded Thumbtack in 2009 after what he called a “banal observation: Why is it so damn hard to find a plumber?”

+

Thumbtack lets customers describe a job — anything from painting a house to walking a dog — with lots of specifics, including location, timing and budget. It sends those leads to service providers, who pay from $3 to $20 for the customer’s contact information so they can follow up with a quote. Each project can receive up to five quotes. “We’re more like a dating app than a traditional e-commerce site,” Zappacosta said.

+ + + + + +
+
+

After the initial introduction, Thumbtack bows out of the relationship, although at some point it might add optional back-end services such as a scheduling platform and payment system, he said.

+

The site facilitates more than 5 million projects a year, averaging about $500 each. Home improvement, Thumbtack’s biggest category, is a lucrative and highly fragmented market worth well over $500 million annually in the United States. It lists 200,000 professionals in 1,500 categories, including electricians, plumbers, painters, personal trainers, interior designers, gardeners, DJs, tutors, coaches, florists and makeup artists. Zappacosta once offered Italian cooking lessons and catering via the site, which he said was invaluable in learning a service pro’s perspective.

+

“Thumbtack sends us exactly what customers are looking for,” said Geraldine Chiaramonte, who runs Bay Area Candy Buffet, providing sweet treats for weddings and other occasions. Once she sees the requests, she decides whether to pay $3 per lead to follow up with a proposal. Up to a third of her business now comes from Thumbtack, after three years on the site.

+

A good analogy for what Thumbtack does is the Internet itself, said Bryan Schreier, a partner at Sequoia, which invested in Thumbtack’s current round and three prior ones. “There was a ton of information online, but it took Google to actually make that information useful and make it easier to connect consumers with what they were looking for,” he said. “This is a similar approach to solving a very big problem.”

+
Gluten-free sour watermelons from the Bay Area Candy Buffet, Tuesday, April 21, 2015, in Union City, Calif. Bay Area Candy Buffet caters sweets to clients. Photo: Santiago Mejia, The Chronicle
+ + Photo: Santiago Mejia, The Chronicle + + + + + +
+
+ Gluten-free sour watermelons from the Bay Area Candy Buffet, Tuesday, April 21, 2015, in Union City, Calif. Bay Area Candy Buffet caters sweets to clients. + +
+ +

So could Thumbtack grow as big as Google? “I don’t want to sound too over-the-top, but the local services market is much larger than the online advertising market,” Schreier said. “Thumbtack is a very large business already, when the reality is that they are tapping less than 1 percent of the available market today.”

+

That market’s gargantuan size has lured behemoths, with both Amazon and Google experimenting with their own home-repairs offerings. Google Capital is also an investor in Thumbtack, having led a $100 million round last year, as well as participating in the current round.

+

The latest funding brings Thumbtack’s total backing to $273.2 million. It was led by Scottish investment firm Baillie Gifford with several existing investors participating. Zappacosta said the money will go toward improving the product and marketing the brand. Thumbtack has 134 employees in San Francisco and 240 in Salt Lake City, where its customer service is based.

+
Geraldine Orozco, founder and creative director of Bay Area Candy Buffet, went through a "dress rehearsal" to prepare to cater for a client's wedding on Wednesday, pictured Tuesday, April 21, 2015, at her home in Union City, Calif. Bay Area Candy Buffet caters sweets to clients. Orozco uses Thumbtack.com, which connects consumers to home professionals. Photo: Santiago Mejia, The Chronicle
+ + Photo: Santiago Mejia, The Chronicle + + + + + +
+
+ Geraldine Orozco, founder and creative director of Bay Area Candy Buffet, went through a "dress rehearsal" to prepare to cater for a client's wedding on Wednesday, pictured Tuesday, April 21, 2015, at her home in Union City, Calif. Bay Area Candy Buffet caters sweets to clients. Orozco uses Thumbtack.com, which connects consumers to home professionals. + +
+ +

Some rivals such as TaskRabbit act as middlemen, arranging each service call in exchange for a cut. That model — like that of Uber, Lyft, Postmates and other gig economy services — has stirred up huge controversy about whether the independent-contractor workers should be reclassified as employees. That’s not an issue for Thumbtack, since it operates as a referral service.

+

Zappacosta said its less-controversial status drew Republican presidential candidate Jeb Bush, who rode in an Uber to Thumbtack for a heavily publicized campaign stop in July, shortly after Democratic front-runner Hillary Rodham Clinton criticized the gig economy’s lack of worker protections.

+

“We are unambiguously a force to empower business owners, not try to replace them,” Zappacosta said. “Jeb was trying to align with pro-business sentiment.”

+ +

Carolyn Said is a San Francisco Chronicle staff writer. E-mail: csaid@sfchronicle.com Twitter: @csaid

+
+

+
Petit fours from the Bay Area Candy Buffet, Tuesday, April 21, 2015, in Union City, Calif. The company caters sweets to clients. Photo: Santiago Mejia, The Chronicle
+ + Photo: Santiago Mejia, The Chronicle + + + + + +
+
+ Petit fours from the Bay Area Candy Buffet, Tuesday, April 21, 2015, in Union City, Calif. The company caters sweets to clients. + +
+ +
+ + +
+ + + +
+
+ Carolyn Said +

+ Carolyn Said +

+

Business Reporter

+
+
+ +
+
+ +
+
+ +
+
+ + +
+ + + + + +
+
+
+
+ + +
+
+ + +
+ + + + + + + + +
+ +
+
+
+ +
+ +
+ + + + + + + + + +
+ +
+ +