Skip to content

Commit

Permalink
Added ability to override/force the reading method.
Browse files Browse the repository at this point in the history
  • Loading branch information
crscheid committed Dec 21, 2020
1 parent fe41669 commit 28696cc
Show file tree
Hide file tree
Showing 4 changed files with 209 additions and 104 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
# ChangeLog


## Version 2.2

- Added ability to override/force the reading method.

## Version 2.1

- Added handling of common Google referral URLs
- Added 'result_url' to the return structure to inform the caller what the resultant URL was after redirects


## Version 2.0.1

- Turned off debugging left on by mistake
Expand Down
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,16 @@ $myUserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.
$extractor = new ArticleExtractor(null, $myUserAgent);
```

### Force Reading Method

It is possible to force the method by which the reading is attempted: Readability, Goose, Goose followed by our custom processing, or our custom processing alone. This can come in handy when Readability or Goose has issues with particular websites.

To force the method, provide a third argument to the constructor, as shown below. The four valid methods are `readability`, `goose`, `goosecustom`, and `custom`.

```php
$extractor = new ArticleExtractor(null, null, "goose");
```


## Output Format

Expand Down
271 changes: 170 additions & 101 deletions src/ArticleExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
class ArticleExtractor {

// Debug flag - set to true for convenience during development
private $debug = false;
private $debug = true;

// Valid root elements we want to search for
private $valid_root_elements = [ 'body', 'form', 'main', 'div', 'ul', 'li', 'table', 'span', 'section', 'article', 'main'];
Expand All @@ -33,9 +33,19 @@ class ArticleExtractor {
// User agent to override
private $user_agent = null;

public function __construct($api_key = null, $user_agent = null) {
// Method to force
private $force_method = null;

/**
 * Creates a new extractor instance.
 *
 * @param mixed $api_key      Stored on the instance as-is (optional).
 * @param mixed $user_agent   User agent to override; stored as-is (optional).
 * @param mixed $force_method One of 'readability', 'goose', 'goosecustom' or
 *                            'custom' to force a single reading method. Any
 *                            other value (including null) leaves no method forced.
 */
public function __construct($api_key = null, $user_agent = null, $force_method = null) {
    $this->api_key = $api_key;
    $this->user_agent = $user_agent;

    // Strict comparison: a loose in_array() would accept non-string values
    // such as true (or 0 on PHP < 8) as if they were valid method names.
    $valid_methods = ['readability', 'goose', 'goosecustom', 'custom'];
    $this->force_method = in_array($force_method, $valid_methods, true) ? $force_method : null;
}


Expand Down Expand Up @@ -74,13 +84,43 @@ public function processURL($url) {

$this->log_debug("Attempting to parse " . $url);

// First attempt to parse the URL into the structure we want
$results = $this->parseViaReadability($url);
// If we don't have a force method enabled, then simply run them in the following order
if ($this->force_method == null) {

// First try with readability
$results = $this->parseViaReadability($url);

// If we don't see what we want, try our other method
if ($results['text'] == null) {
$results = $this->parseViaGoose($url);
}

// If we still don't have text, then try our custom method passing in the results from the prior Goose Call
if ($results['text'] == null) {
$results = $this->parseViaCustom($url, $results);
}
}
// Otherwise, run them specifically
else {
switch($this->force_method) {
case 'readability':
$results = $this->parseViaReadability($url);
break;
case 'goose':
$results = $this->parseViaGoose($url);
break;
case 'goosecustom':
$results = $this->parseViaGoose($url);
if ($results['text'] == null) {
$results = $this->parseViaCustom($url, $results);
}
break;
case 'custom':
$results = $this->parseViaCustom($url);
break;
}
}

// If we don't see what we want, try our other method
if ($results['text'] == null) {
$results = $this->parseViaGooseOrCustom($url);
}

// Add the resultant URL after redirects
$results['result_url'] = $url;
Expand Down Expand Up @@ -200,19 +240,18 @@ private function parseViaReadability($url) {
}
/**
* Attempts to parse via the Goose libary and our custom processing. Returns the
* following array.
/**
* Attempts to parse via the Goose library. Returns the following array.
* [
* 'method' => "goose" | "custom" | null
* 'method' => "goose" | null
* 'title' => <the title of the article>
* 'text' => <the cleaned text of the article> | null
* 'html' => <the raw HTML of the article>
* ]
*
* Parsing can be considered unavailable if 'text' is returned as null
*/
private function parseViaGooseOrCustom($url) {
private function parseViaGoose($url) {

$text = null;
$method = "goose";
Expand All @@ -228,123 +267,153 @@ private function parseViaGooseOrCustom($url) {
$article = $goose->extractContent($url);
$title = $article->getTitle();
$html = $article->getRawHtml();
$text = $article->getCleanedArticleText();
// If Goose failed, $text will be null here

}
catch (\Exception $e) {
$this->log_debug('parseViaGoose: Unable to request url ' . $url . " due to " . $e->getMessage());
}

return ['parse_method'=>$method, 'title'=>$title, 'text'=>$text, 'html'=>$html];
}


/**
* Attempts to parse via the Goose library and our custom processing. Returns the
* following array.
* [
* 'method' => "custom"
* 'title' => <the title of the article>
* 'text' => <the cleaned text of the article> | null
* 'html' => <the raw HTML of the article>
* ]
*
* Parsing can be considered unavailable if 'text' is returned as null
*/
private function parseViaCustom($url, $priorResults = null) {

// If Goose failed
if ($article->getCleanedArticleText() == null) {
$method = "custom";
$text = null;

// Get the HTML from goose
$html_string = $article->getRawHtml();
$this->log_debug("Parsing via: custom method");

//$this->log_debug("---- RAW HTML -----------------------------------------------------------------------------------");
//$this->log_debug($html_string);
//$this->log_debug("-------------------------------------------------------------------------------------------------");
try {

// Ok then try it a different way
$dom = new Dom;
$dom->load($html_string, ['whitespaceTextNode' => false]);
if($priorResults == null) {
// Try to get the title and HTML text from Goose first
$this->log_debug("Downloading HTML via Goose");
$goose = new GooseClient(['image_fetch_best' => false]);
$article = $goose->extractContent($url);
$title = $article->getTitle();
$html = $article->getRawHtml();
}
else {
$this->log_debug("Using prior HTML and title from Goose");
$title = $priorResults['title'];
$html = $priorResults['html'];
}

// First, just completely remove the items we don't even care about
$nodesToRemove = $dom->find('script, style, header, footer, input, button, aside, meta, link');
//$this->log_debug("---- RAW HTML -----------------------------------------------------------------------------------");
//$this->log_debug($html);
//$this->log_debug("-------------------------------------------------------------------------------------------------");

foreach($nodesToRemove as $node) {
$node->delete();
unset($node);
}
// Ok then try it a different way
$dom = new Dom;
$dom->load($html, ['whitespaceTextNode' => false]);

// Records to store information on the best dom element found thusfar
$best_element = null;
$best_element_wc = 0;
$best_element_wc_ratio = -1;
// First, just completely remove the items we don't even care about
$nodesToRemove = $dom->find('script, style, header, footer, input, button, aside, meta, link');

// $html = $dom->outerHtml;
foreach($nodesToRemove as $node) {
$node->delete();
unset($node);
}

// Get a list of qualifying nodes we want to evaluate as the top node for content
$candidateNodes = $this->buildAllNodeList($dom->root);
$this->log_debug("Candidate node count: " . count($candidateNodes));
// Records to store information on the best dom element found thusfar
$best_element = null;
$best_element_wc = 0;
$best_element_wc_ratio = -1;

// Find a target best element
foreach($candidateNodes as $node) {
// $html = $dom->outerHtml;

// Calculate the wordcount, whitecount, and wordcount ratio for the text within this element
$this_element_wc = str_word_count($node->text(true));
$this_element_whitecount = substr_count($node->text(true), ' ');
$this_element_wc_ratio = -1;
// Get a list of qualifying nodes we want to evaluate as the top node for content
$candidateNodes = $this->buildAllNodeList($dom->root);
$this->log_debug("Candidate node count: " . count($candidateNodes));

// If the wordcount is not zero, then calculation the wc ratio, otherwise set it to -1
$this_element_wc_ratio = ($this_element_wc == 0) ? -1 : $this_element_whitecount / $this_element_wc;
// Find a target best element
foreach($candidateNodes as $node) {

// Calculate the word count contribution for all children elements
$children_wc = 0;
$children_num = 0;
foreach($node->getChildren() as $child) {
if (in_array($child->tag->name(),$this->valid_root_elements)) {
$children_num++;
$children_wc += str_word_count($child->text(true));
}
}
// Calculate the wordcount, whitecount, and wordcount ratio for the text within this element
$this_element_wc = str_word_count($node->text(true));
$this_element_whitecount = substr_count($node->text(true), ' ');
$this_element_wc_ratio = -1;

// This is the contribution for this particular element not including the children types above
$this_element_wc_contribution = $this_element_wc - $children_wc;
// If the wordcount is not zero, then calculation the wc ratio, otherwise set it to -1
$this_element_wc_ratio = ($this_element_wc == 0) ? -1 : $this_element_whitecount / $this_element_wc;

// Debug information on this element for development purposes
$this->log_debug("Element:\t". $node->tag->name() . "\tTotal WC:\t" . $this_element_wc . "\tTotal White:\t" . $this_element_whitecount . "\tRatio:\t" . number_format($this_element_wc_ratio,2) . "\tElement WC:\t" . $this_element_wc_contribution . "\tChildren WC:\t" . $children_wc . "\tChild Contributors:\t" . $children_num . "\tBest WC:\t" . $best_element_wc . "\tBest Ratio:\t" . number_format($best_element_wc_ratio,2) . " " . $node->getAttribute('class'));
// Calculate the word count contribution for all children elements
$children_wc = 0;
$children_num = 0;
foreach($node->getChildren() as $child) {
if (in_array($child->tag->name(),$this->valid_root_elements)) {
$children_num++;
$children_wc += str_word_count($child->text(true));
}
}

// Now check to see if this element appears better than any previous one
// This is the contribution for this particular element not including the children types above
$this_element_wc_contribution = $this_element_wc - $children_wc;

// We do this by first checking to see if this element's WC contribution is greater than the previous
if ($this_element_wc_contribution > $best_element_wc) {
// Debug information on this element for development purposes
$this->log_debug("Element:\t". $node->tag->name() . "\tTotal WC:\t" . $this_element_wc . "\tTotal White:\t" . $this_element_whitecount . "\tRatio:\t" . number_format($this_element_wc_ratio,2) . "\tElement WC:\t" . $this_element_wc_contribution . "\tChildren WC:\t" . $children_wc . "\tChild Contributors:\t" . $children_num . "\tBest WC:\t" . $best_element_wc . "\tBest Ratio:\t" . number_format($best_element_wc_ratio,2) . " " . $node->getAttribute('class'));

// If we so we then calculate the improvement ratio from the prior best and avoid division by 0
$wc_improvement_ratio = ($best_element_wc == 0) ? 100 : $this_element_wc_contribution / $best_element_wc;
// Now check to see if this element appears better than any previous one

// There are three conditions in which this candidate should be chosen
// 1. The previous best is zero
// 2. The new best is more than 10% greater WC contribution than the prior best
// 3. The new element wc ratio is less than the existing best element's ratio
// We do this by first checking to see if this element's WC contribution is greater than the previous
if ($this_element_wc_contribution > $best_element_wc) {

if ( $best_element_wc == 0 || $wc_improvement_ratio >= 1.10 || $this_element_wc_ratio <= $best_element_wc_ratio) {
$best_element_wc = $this_element_wc_contribution;
$best_element_wc_ratio = $this_element_wc_ratio;
$best_element = $node;
$this->log_debug("\t *** New best element ***");
}
}
}
// If we so we then calculate the improvement ratio from the prior best and avoid division by 0
$wc_improvement_ratio = ($best_element_wc == 0) ? 100 : $this_element_wc_contribution / $best_element_wc;

// If we have a candidate element
if ($best_element) {
// There are three conditions in which this candidate should be chosen
// 1. The previous best is zero
// 2. The new best is more than 10% greater WC contribution than the prior best
// 3. The new element wc ratio is less than the existing best element's ratio

// Now we need to do some sort of peer analysis
$best_element = $this->peerAnalysis($best_element);
if ( $best_element_wc == 0 || $wc_improvement_ratio >= 1.10 || $this_element_wc_ratio <= $best_element_wc_ratio) {
$best_element_wc = $this_element_wc_contribution;
$best_element_wc_ratio = $this_element_wc_ratio;
$best_element = $node;
$this->log_debug("\t *** New best element ***");
}
}
}

/*
// Add space before HTML elements that if removed create concatenation issues (e.g. <p>, <li>)
$nodesToEditText = $best_element->find('p, li');
// If we have a candidate element
if ($best_element) {

foreach($nodesToEditText as $node) {
$node->setText(" " . $node->text);
}
// Now we need to do some sort of peer analysis
$best_element = $this->peerAnalysis($best_element);

*/
//
// Decode the text
//$text = html_entity_decode($best_element->text(true));
$text = html_entity_decode($this->getTextForNode($best_element));
/*
// Add space before HTML elements that if removed create concatenation issues (e.g. <p>, <li>)
$nodesToEditText = $best_element->find('p, li');
// Set the method so the caller knows which one was used
$method = "custom";
}
else {
$method = null;
}
}
else {
$text = $article->getCleanedArticleText();
}
foreach($nodesToEditText as $node) {
$node->setText(" " . $node->text);
}
*/
//
// Decode the text
//$text = html_entity_decode($best_element->text(true));
$text = html_entity_decode($this->getTextForNode($best_element));

}
}
catch (\Exception $e) {
$this->log_debug('parseViaGooseOrCustom: Unable to request url ' . $url . " due to " . $e->getMessage());
$this->log_debug('parseViaGoose: Unable to request url ' . $url . " due to " . $e->getMessage());
}

return ['parse_method'=>$method, 'title'=>$title, 'text'=>$text, 'html'=>$html];
Expand Down
Loading

0 comments on commit 28696cc

Please sign in to comment.