Skip to content

Commit

Permalink
Merge branch 'dev' - v0.7.3 release to master
Browse files Browse the repository at this point in the history
  • Loading branch information
daveajones committed Mar 29, 2018
2 parents e038ef6 + 35176ee commit 2b76fea
Show file tree
Hide file tree
Showing 320 changed files with 487 additions and 266,436 deletions.
6 changes: 5 additions & 1 deletion includes/util.php
Original file line number Diff line number Diff line change
Expand Up @@ -1549,7 +1549,7 @@ function fetchFeedUrl($url, $subcount = 0, $sysver = '', $timeout = 30)


//Gets the data from a URL along with extra info returned
function fetchUrlExtra($url, $timeout = 30)
function fetchUrlExtra($url, $timeout = 30, $referer = "")
{
$url = clean_url($url);

Expand All @@ -1568,6 +1568,10 @@ function fetchUrlExtra($url, $timeout = 30)
curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 2);

if(!empty($referer)) {
curl_setopt($curl, CURLOPT_REFERER, $referer);
}

$data = curl_exec($curl);
$response = curl_getinfo($curl);

Expand Down
28 changes: 28 additions & 0 deletions libraries/readability-php/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,34 @@ All notable changes to this project will be documented in this file.

## Unreleased

## [v1.2.0](https://github.com/andreskrey/readability.php/releases/tag/v1.2.0)

- Merged PR#49 (Missing object when calling `->getContent()`)
- Imported all changes from Readability.js as of 2 March 2018 ([8525c6a](https://github.com/mozilla/readability/commit/8525c6af36d3badbe27c4672a6f2dd99ddb4097f)):
- Check for `<base>` elements before converting URLs to absolute.
- Clean `<link>` tags on `prepArticle()`
- Attempt to return at least some text if all the algorithm runs fail (Check PR [#423](https://github.com/mozilla/readability/pull/423) on JS version)
- Add new test cases for the previous changes
- And all other changes reflected [in this diff](https://github.com/mozilla/readability/compare/c3ff1a2d2c94c1db257b2c9aa88a4b8fbeb221c5...8525c6af36d3badbe27c4672a6f2dd99ddb4097f)

## [v1.1.1](https://github.com/andreskrey/readability.php/releases/tag/v1.1.1)

- Switched from assertEquals to assertSame on unit testing to avoid weak comparisons.
- Added a safe check to avoid sending the DOMDocument as a node when scanning for node ancestors.
- Fix issue #45: Small mistake in documentation
- Fix issue #46: Added `data-src` as an image source path
- Fixed bug when extracting all the images of the article (was extracting images from the original DOM instead of the parsed one)
- Added the `->getDOMDocument()` getter to retrieve the fully parsed DOMDocument
- Merged PR #48 that allows passing an array as configuration (@topotru)

## [v1.1.0](https://github.com/andreskrey/readability.php/releases/tag/v1.1.0)

- Added 'data-orig' as a URL source for images
- Removed 'modal' as a negative property from classes
- Added option to inject a logger
- Removed all references to the `data-readability` tags that don't apply anymore to the new structure
- Merged PR #38 (Missing DOMEntityReference)

## [v1.0.0](https://github.com/andreskrey/readability.php/releases/tag/v1.0.0)

- Node encapsulation is gone. Pre v1 all nodes were encapsulated in a Readability class, which created lots of trouble with dependencies, responsibilities, and properties. Now all the encapsulation is gone: all the DOMNodes inside the Readability class are extensions of the original DOM classes, which allows the system to take advantage of the functions and properties of DOMDocument.
Expand Down
48 changes: 37 additions & 11 deletions libraries/readability-php/README.md
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
# Readability.php
[![Latest Stable Version](https://poser.pugx.org/andreskrey/readability.php/v/stable)](https://packagist.org/packages/andreskrey/readability.php) [![StyleCI](https://styleci.io/repos/71042668/shield?branch=master)](https://styleci.io/repos/71042668) [![Build Status](https://travis-ci.org/andreskrey/readability.php.svg?branch=master)](https://travis-ci.org/andreskrey/readability.php) [![Total Downloads](https://poser.pugx.org/andreskrey/readability.php/downloads)](https://packagist.org/packages/andreskrey/readability.php) [![Monthly Downloads](https://poser.pugx.org/andreskrey/readability.php/d/monthly)](https://packagist.org/packages/andreskrey/readability.php)
[![Latest Stable Version](https://poser.pugx.org/andreskrey/readability.php/v/stable)](https://packagist.org/packages/andreskrey/readability.php) [![Build Status](https://travis-ci.org/andreskrey/readability.php.svg?branch=master)](https://travis-ci.org/andreskrey/readability.php) [![Coverage Status](https://coveralls.io/repos/github/andreskrey/readability.php/badge.svg?branch=master)](https://coveralls.io/github/andreskrey/readability.php/?branch=master) [![StyleCI](https://styleci.io/repos/71042668/shield?branch=master)](https://styleci.io/repos/71042668) [![Total Downloads](https://poser.pugx.org/andreskrey/readability.php/downloads)](https://packagist.org/packages/andreskrey/readability.php) [![Monthly Downloads](https://poser.pugx.org/andreskrey/readability.php/d/monthly)](https://packagist.org/packages/andreskrey/readability.php)

PHP port of *Mozilla's* **[Readability.js](https://github.com/mozilla/readability)**. Parses html text (usually news and other articles) and returns **title**, **author**, **main image** and **text content** without nav bars, ads, footers, or anything that isn't the main body of the text. Analyzes each node, gives them a score, and determines what's relevant and what can be discarded.

![Screenshot](https://raw.githubusercontent.com/andreskrey/readability.php/assets/screenshot.png)

The project's aim is to be a 1 to 1 port of Mozilla's version and to follow closely all changes introduced there, but there are some major differences on the structure. Most of the code is a 1:1 copy –even the comments were imported– but some functions and structures were adapted to better suit the PHP language.

**Lead Developer**: Andres Rey

## Requirements

PHP 5.6+, ext-dom, ext-xml, and ext-mbstring. To install all these dependencies (in the rare case your system does not have them already), you could try something like this in *nix-like environments:

`$ sudo apt-get install php7.1-xml php7.1-mbstring`

**Lead Developer**: Andres Rey

## How to use it

First you have to require the library using composer:
Expand All @@ -24,7 +24,7 @@ First you have to require the library using composer:
Then, create a Readability class and pass a Configuration class, feed the `parse()` function with your HTML and echo the variable:

```php
use andreskrey\Readability\HTMLParser;
use andreskrey\Readability\Readability;
use andreskrey\Readability\Configuration;

$readability = new Readability(new Configuration());
Expand All @@ -35,7 +35,7 @@ try {
$readability->parse($html);
echo $readability;
} catch (ParseException $e) {
echo sprintf('Error processing text: %s', $e->getMessage);
echo sprintf('Error processing text: %s', $e->getMessage());
}
```

Expand All @@ -60,31 +60,57 @@ Here's a list of the available properties:
- Author: `->getAuthor();`
- Text direction (ltr or rtl): `->getDirection();`

If you need to tweak the final HTML you can get the DOMDocument of the result by calling `->getDOMDocument()`.

## Options

You can change the behaviour of Readability via the Configuration object. For example, if you want to fix relative URLs and declare the original URL, you could set up the configuration like this:

```php
$configuration = new Configuration();
$configuration->setFixRelativeURLs(true)
$configuration
->setFixRelativeURLs(true)
->setOriginalURL('http://my.newspaper.url/article/something-interesting-to-read.html');
```
Also you can pass an array of configuration parameters to the constructor:
```php
$configuration = new Configuration([
'fixRelativeURLs' => true,
'originalURL' => 'http://my.newspaper.url/article/something-interesting-to-read.html',
// other parameters ... listing below
]);
```


Then you pass this Configuration object to Readability. The following options are available. Remember to prepend `set` when calling them.
Then you pass this Configuration object to Readability. The following options are available. Remember to prepend `set` when calling them using native setters.

- **MaxTopCandidates**: default value `5`, max amount of top level candidates.
- **WordThreshold**: default value `500`, minimum amount of characters to consider that the article was parsed successfully.
- **ArticleByLine**: default value `false`, search for the article byline and remove it from the text. It will be moved to the article metadata.
- **StripUnlikelyCandidates**: default value `true`, remove nodes that are unlikely to have relevant information. Useful for debugging or parsing complex or non-standard articles.
- **CleanConditionally**: default value `true`, remove certain nodes after parsing to return a cleaner result.
- **WeightClasses**: default value `true`, weight classes during the rating phase.
- **RemoveReadabilityTags**: default value `true`, remove the data-readability tags inside the nodes that are added during the rating phase.
- **FixRelativeURLs**: default value `false`, convert relative URLs to absolute. Like `/test` to `http://host/test`.
- **SubstituteEntities**: default value `false`, disables the `substituteEntities` flag of libxml. Will avoid substituting HTML entities. Like `&aacute;` to á.
- **NormalizeEntities**: default value `false`, converts UTF-8 characters to their HTML Entity equivalent. Useful to parse HTML with mixed encoding.
- **OriginalURL**: default value `http://fakehost`, original URL from the article used to fix relative URLs.
- **SummonCthulhu**: default value `false`, remove all `<script>` nodes via regex. This is not ideal as it might break things, but might be the only solution to [libxml problems with unescaped javascript](https://github.com/andreskrey/readability.php#known-issues). If you're not parsing Javascript tutorials, it's recommended to always set this option as `true`.

### Debug log

Logging is optional and you will have to inject your own logger to save all the debugging messages. To do so, use a logger that implements the [PSR-3 logging interface](https://github.com/php-fig/log) and pass it to the configuration object. For example:

```
// Using monolog
$log = new Logger('Readability');
$log->pushHandler(new StreamHandler('path/to/my/log.txt'));
$configuration->setLogger($log);
```

In the log you will find information about the parsed nodes, why they were removed, and why they were considered relevant to the final article.

## Limitations

Of course the main limitation is PHP. Websites that load the content through lazy loading, AJAX, or any type of javascript fueled call will be ignored (actually, *not run*) and the resulting text will be incorrect, compared to the readability.js results. All the articles you want to parse with readability.php need to be complete and all the content should be in the HTML already.
Expand Down Expand Up @@ -126,7 +152,7 @@ Self closing tags like `<br />` get automatically expanded to `<br></br`. No way

## Dependencies

Readability.php has no dependencies to other libraries.
Readability.php uses the [PSR Log](https://github.com/php-fig/log) interface to define the allowed type of loggers. [Monolog](https://github.com/Seldaek/monolog) is only required on development installations (`--dev` option during `composer install`).

## To-do

Expand All @@ -139,7 +165,7 @@ Readability parses all the text with DOMDocument, scans the text nodes and gives

## Code porting

Up to date with readability.js as of [16 Oct 2017](https://github.com/mozilla/readability/commit/c3ff1a2d2c94c1db257b2c9aa88a4b8fbeb221c5).
Up to date with readability.js as of [2 Mar 2018](https://github.com/mozilla/readability/commit/8525c6af36d3badbe27c4672a6f2dd99ddb4097f).

## License

Expand All @@ -157,4 +183,4 @@ Based on Arc90's readability.js (1.7.1) script available at: http://code.google.
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
limitations under the License.
9 changes: 7 additions & 2 deletions libraries/readability-php/composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,14 @@
"php": ">=5.6.0",
"ext-dom": "*",
"ext-xml": "*",
"ext-mbstring": "*"
"ext-mbstring": "*",
"psr/log": "^1.0"
},
"require-dev": {
"phpunit/phpunit": "^5.7"
"phpunit/phpunit": "^5.7",
"monolog/monolog": "^1.23"
},
"suggest": {
"monolog/monolog": "Allow logging debug information"
}
}
108 changes: 79 additions & 29 deletions libraries/readability-php/src/Configuration.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,56 +2,131 @@

namespace andreskrey\Readability;

//use Psr\Log\LoggerAwareTrait;
//use Psr\Log\LoggerInterface;
//use Psr\Log\NullLogger;

/**
* Class Configuration.
*/
class Configuration
{
// use LoggerAwareTrait;

/**
* @var int
*/
protected $maxTopCandidates = 5;

/**
* @var int
*/
protected $wordThreshold = 500;

/**
* @var bool
*/
protected $articleByLine = false;

/**
* @var bool
*/
protected $stripUnlikelyCandidates = true;

/**
* @var bool
*/
protected $cleanConditionally = true;

/**
* @var bool
*/
protected $weightClasses = true;
/**
* @var bool
*/
protected $removeReadabilityTags = true;

/**
* @var bool
*/
protected $fixRelativeURLs = false;

/**
* @var bool
*/
protected $substituteEntities = false;

/**
* @var bool
*/
protected $normalizeEntities = false;

/**
* @var bool
*/
protected $summonCthulhu = false;

/**
* @var string
*/
protected $originalURL = 'http://fakehost';

/**
* Builds a configuration, optionally pre-populated from an array of options.
*
* Each array key is prefixed with "set" to form a setter name; when a method
* with that name exists on this object it is invoked with the matching value.
* Keys that do not map to an existing setter are silently skipped.
*
* @param array $params Option name => value pairs, e.g. ['fixRelativeURLs' => true]
*/
public function __construct(array $params = [])
{
    foreach ($params as $option => $optionValue) {
        $method = 'set' . $option;

        // Unknown options are ignored rather than treated as errors.
        if (!method_exists($this, $method)) {
            continue;
        }

        call_user_func([$this, $method], $optionValue);
    }
}

/**
* Exports the configuration as an associative array.
*
* Walks over the properties of this object and, for every property whose
* current value is not an object and for which a "get<property>" method
* exists, stores the getter's return value under the property name.
*
* @return array Property name => value as reported by the matching getter
*/
public function toArray()
{
    $result = [];

    foreach ($this as $property => $current) {
        // Object-valued properties are left out of the export.
        if (is_object($current)) {
            continue;
        }

        $accessor = 'get' . $property;
        if (method_exists($this, $accessor)) {
            $result[$property] = call_user_func([$this, $accessor]);
        }
    }

    return $result;
}

/**
* Returns the logger for this configuration.
*
* Logging is disabled in this copy of the class: the Psr\Log imports and the
* LoggerAwareTrait at the top of the file are commented out, and this method
* unconditionally returns null instead of a logger instance.
*
* NOTE(review): callers that invoke methods on the returned value would fail
* on null — confirm that every call site guards against it.
*
* @return null
*/
public function getLogger()
{
// Previous behaviour, kept for reference: fall back to a NullLogger when no
// logger has been set, otherwise return the injected one.
// if ($this->logger === null) {
// return new NullLogger();
// } else {
// return $this->logger;
// }
return null;
}

/**
* Stores the given logger on this configuration and returns the same
* instance so calls can be chained.
*
* NOTE(review): the `use Psr\Log\LoggerInterface;` import at the top of this
* file is commented out, so the `LoggerInterface` type hint below presumably
* resolves relative to the `andreskrey\Readability` namespace — confirm that a
* PSR-3 logger can actually be passed here. The `logger` property is also not
* declared among the visible properties of this class (the LoggerAwareTrait
* `use` is commented out), and getLogger() currently ignores it.
*
* @param LoggerInterface $logger Logger to store
*
* @return Configuration
*/
public function setLogger(LoggerInterface $logger)
{
$this->logger = $logger;

return $this;
}

/**
* @return int
*/
Expand Down Expand Up @@ -172,26 +247,6 @@ public function setWeightClasses($weightClasses)
return $this;
}

/**
* Returns the current value of the removeReadabilityTags option
* (declared with a default of true).
*
* @return bool
*/
public function getRemoveReadabilityTags()
{
return $this->removeReadabilityTags;
}

/**
* Sets the removeReadabilityTags option and returns this instance so calls
* can be chained.
*
* @param bool $removeReadabilityTags New value for the option
*
* @return $this
*/
public function setRemoveReadabilityTags($removeReadabilityTags)
{
$this->removeReadabilityTags = $removeReadabilityTags;

return $this;
}

/**
* @return bool
*/
Expand Down Expand Up @@ -291,9 +346,4 @@ public function setSummonCthulhu($summonCthulhu)

return $this;
}

/**
* @var bool
*/
protected $summonCthulhu = false;
}
2 changes: 2 additions & 0 deletions libraries/readability-php/src/Nodes/DOM/DOMDocument.php
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ public function __construct($version, $encoding)
$this->registerNodeClass('DOMDocumentFragment', DOMDocumentFragment::class);
$this->registerNodeClass('DOMDocumentType', DOMDocumentType::class);
$this->registerNodeClass('DOMElement', DOMElement::class);
$this->registerNodeClass('DOMEntity', DOMEntity::class);
$this->registerNodeClass('DOMEntityReference', DOMEntityReference::class);
$this->registerNodeClass('DOMNode', DOMNode::class);
$this->registerNodeClass('DOMNotation', DOMNotation::class);
$this->registerNodeClass('DOMProcessingInstruction', DOMProcessingInstruction::class);
Expand Down
10 changes: 10 additions & 0 deletions libraries/readability-php/src/Nodes/DOM/DOMEntity.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<?php

namespace andreskrey\Readability\Nodes\DOM;

use andreskrey\Readability\Nodes\NodeTrait;

/**
* Package-specific entity node class.
*
* Extends PHP's built-in \DOMEntity and mixes in NodeTrait. The class adds no
* members of its own; everything beyond the native DOMEntity API comes from
* the trait.
*/
class DOMEntity extends \DOMEntity
{
use NodeTrait;
}
Loading

0 comments on commit 2b76fea

Please sign in to comment.