Skip to content

Commit

Permalink
Update to Version 2.0
Browse files Browse the repository at this point in the history
Closes #27.
Closes #26.
  • Loading branch information
crscheid committed Feb 26, 2020
1 parent dc40601 commit 81e2e39
Show file tree
Hide file tree
Showing 7 changed files with 87 additions and 21 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.DS_Store
/vendor
composer.lock
.phpunit.result.cache
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
# ChangeLog

## Version 2.0

- Added ability to manually set User-Agent, fixing many readability issues
- Updated redirect detection logic to more accurately read HTTP headers.
- Updated dependencies
- Updated PHPUnit to ^8.0
- Updated andreskrey/readability.php to ^2.1.0
- Updated PHP dependency to ^7.2

## Version 1.0.1

- Fixed minor issue with `parse_url` check.
Expand Down
21 changes: 17 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,12 @@ array(5) {
}
```

You can also create the `ArticleExtractor` class by passing in a key for the language detection service. See more information below.
You can also create the `ArticleExtractor` class by passing in a key for the language detection service as well as a custom User-Agent string. See more information below.


## Language Detection Methods
## Options

### Language Detection Methods

Language detection is handled by either looking for language specifiers within the HTML meta data or by utilizing the [Detect Language](http://detectlanguage.com/) service.

Expand All @@ -53,14 +55,25 @@ If language detection fails or is not available, both of these fields will be re

[Detect Language](http://detectlanguage.com/) requires the use of an API KEY which you can sign up for. However, you can also use this library without it. If the HTML meta data do not contain information about the language of the article, then `language` and `language_method` will be returned as null values.

To utilize this library utilizing the language detection service, create the `ArticleExtractor` object by passing in your API KEY for [Detect Language](http://detectlanguage.com/) or by setting `DETECT_LANGUAGE_KEY` in your environment variables.
To use this library with the language detection service, create the `ArticleExtractor` object by passing in your API KEY for [Detect Language](http://detectlanguage.com/).

```php
use Cscheide\ArticleExtractor\ArticleExtractor;

$extractor = new ArticleExtractor('your api key');
```

### Setting User Agent

It is possible to set the User-Agent for outgoing requests. To do so, pass the desired User-Agent string to the constructor as follows:

```php
use Cscheide\ArticleExtractor\ArticleExtractor;

$myUserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36";
$extractor = new ArticleExtractor(null, $myUserAgent);
```


## Output Format

Expand Down Expand Up @@ -88,4 +101,4 @@ Unit tests are included in this distribution and can be run utilizing PHPUnit
./vendor/phpunit/phpunit/phpunit
```

Note: You may need to set the environment variable `DETECT_LANGUAGE_KEY` with your [Detect Language](http://detectlanguage.com/) key in order for language detection to work properly.
> Note: Please set the environment variable `DETECT_LANGUAGE_KEY` with your [Detect Language](http://detectlanguage.com/) key in order for language detection to work properly.
6 changes: 3 additions & 3 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@
}
],
"require": {
"php": ">=5.5",
"php": ">=7.2",
"scotteh/php-goose": "dev-master",
"thesoftwarefanatics/php-html-parser": "^1.8.0",
"detectlanguage/detectlanguage": "2.*",
"andreskrey/readability.php": "^1.2.0"
"andreskrey/readability.php": "^2.1.0"
},
"require-dev": {
"phpunit/phpunit": "^6.0"
"phpunit/phpunit": "^8.0"
},
"autoload": {
"psr-4": {
Expand Down
5 changes: 2 additions & 3 deletions phpunit.xml
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,10 @@
convertNoticesToExceptions="true"
convertWarningsToExceptions="true"
processIsolation="false"
stopOnFailure="false"
syntaxCheck="false">
stopOnFailure="false">
<testsuites>
<testsuite name="Article Parser Test Suite">
<file>tests/ExtractorTest.php</file>
</testsuite>
</testsuites>
</phpunit>
</phpunit>
52 changes: 45 additions & 7 deletions src/ArticleExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
class ArticleExtractor {

// Debug flag - set to true for convenience during development
private $debug = false;
private $debug = true;

// Valid root elements we want to search for
private $valid_root_elements = [ 'body', 'form', 'main', 'div', 'ul', 'li', 'table', 'span', 'section', 'article', 'main'];
Expand All @@ -30,8 +30,12 @@ class ArticleExtractor {
// API key for the remote detection service
private $api_key = null;

public function __construct($api_key = null) {
// User agent to override
private $user_agent = null;

public function __construct($api_key = null, $user_agent = null) {
$this->api_key = $api_key;
$this->user_agent = $user_agent;
}


Expand Down Expand Up @@ -155,7 +159,15 @@ private function parseViaReadability($url) {
$readability = new Readability(new Configuration(['SummonCthulhu'=>true]));

try {
$html = file_get_contents($url);
if($this->user_agent != null) {
$this->log_debug("Manually setting user agent for file_get_contents to " . $this->user_agent);
$context = stream_context_create(array('http' => array('user_agent' => $this->user_agent)));
$html = file_get_contents($url, false, $context);
}
else {
$html = file_get_contents($url);
}

$readability->parse($html);
$title = $readability->getTitle();
$text = $readability->getContent();
Expand Down Expand Up @@ -351,12 +363,17 @@ private function checkForRedirects($url, $count = 0) {
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_NOBODY, true); // exclude the body from the request, we only want the header here
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);

// NOTE: We don't set user-agent here because many of the redirect services will use meta refresh instead of location headers to redirect.

$a = curl_exec($ch);

if(preg_match('#[Ll]ocation: (.*)#', $a, $r)) {
$new_url = trim($r[1]);
$new_url = $this->findLocationHeader($a);

if($new_url != null) {
$this->log_debug("Redirect found to: " . $new_url);

// Check to see if new redirect has scheme and host
Expand Down Expand Up @@ -385,6 +402,26 @@ private function checkForRedirects($url, $count = 0) {
}
}

/**
 * Finds the value of the "Location" header in a raw block of HTTP response headers.
 *
 * The header name is matched case-insensitively, so "Location:" and "location:" are
 * both recognised. The first matching line wins.
 *
 * @param string $text Raw header text, one header field per line.
 * @return string|null The header value with surrounding whitespace removed, or null
 *                     if no Location header is present.
 */
private function findLocationHeader($text) {

    foreach (explode("\n", $text) as $line) {

        // Split on the first colon only, so colons inside the URL (scheme, port)
        // remain part of the value.
        $header_item = explode(":", $line, 2);

        // A line without a colon is not a header field. Skipping it avoids treating
        // a bare "location" line as a header and returning a mangled value.
        if (count($header_item) < 2) {
            continue;
        }

        if (mb_strtolower($header_item[0]) === "location") {
            // trim() also strips the trailing "\r" left over from CRLF line endings.
            return trim($header_item[1]);
        }
    }

    return null;
}

/**
* Shifts encoding to UTF if needed
*/
Expand Down Expand Up @@ -534,7 +571,7 @@ private function identifyLanguage($text) {
return false;
}

try {
try {
// Set the API key for detect language library
DetectLanguage::setApiKey($this->api_key);

Expand Down Expand Up @@ -599,7 +636,8 @@ private function checkHTMLForLanguageHint($html_string) {

}

/**
/* *
*
function translateText($text, $targetLang)
{
$baseUrl = "https://translate.yandex.net/api/v1.5/tr.json/translate?key=YOUR_yandex_api_key";
Expand Down
14 changes: 10 additions & 4 deletions tests/ExtractorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
class ExtractorTest extends TestCase {

private $problem_sites = [
'https://slashdot.org/story/18/07/19/2050244/microsofts-plan-to-try-to-win-back-consumers-with-modern-life-services',
'http://feedproxy.google.com/~r/businessinsider/~3/EChmgXESt_4/wells-fargo-close-settlement-end-probes-sales-practices-federal-prosecutors-2020-2-1028927535', // Issue #26 multiple redirects occur when browser user-agent not set
'http://www.businesswire.com/news/home/20200213005014/en/ID-Solutions-S.r.l.-Murata-ID-Solutions-S.r.l./?feedref=JjAwJuNHiystnCoBq_hl-fLcmYSZsqlD_XPbplM8Ta6D8R-QU5o2AvY8bhI9uvWSD8DYIYv4TIC1g1u0AKcacnnViVjtb72bOP4-4nHK5iej_DoWrIhfD31cAxcB60aE', // Redirect detection issue
'http://feeds.reuters.com/~r/reuters/companyNews/~3/vaJcALwyZeA/mexico-k', // Issue #23 301 redirects to incomplete URL
'https://www.bbc.co.uk/news/uk-politics-47379565', // Issue #23 301 redirects to incomplete URL
'https://www.fastcompany.com/3067246/innovation-agents/the-unexpected-design-challenge-behind-slacks-new-threaded-conversations',
Expand Down Expand Up @@ -36,18 +39,21 @@ class ExtractorTest extends TestCase {
// Temporary redirect 307 to terms of service violation which prevents link from resolving
'https://www.bloomberg.com/news/articles/2018-07-12/jpmorgan-wells-fargo-may-go-back-to-basics-with-loans-in-focus',

// Multiple redirects
'https://slashdot.org/story/18/07/19/2050244/microsofts-plan-to-try-to-win-back-consumers-with-modern-life-services',

// Iframes
'http://feeds.bizjournals.com/~r/bizj_washington/~3/043koKcU8Zk/walter-reed-project-signs-day-care-preschool.html',

];

public function testProblemSites()
{

$testUserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36";

echo "\n";

foreach($this->problem_sites as $url) {
$parser = new ArticleExtractor(getenv('DETECT_LANGUAGE_KEY'));
// $parser = new ArticleExtractor(getenv('DETECT_LANGUAGE_KEY'));
$parser = new ArticleExtractor(getenv('DETECT_LANGUAGE_KEY'), $testUserAgent);
echo "Testing: " . $url . "\n";

$result = $parser->processURL($url);
Expand Down

0 comments on commit 81e2e39

Please sign in to comment.