diff --git a/CHANGELOG.md b/CHANGELOG.md index fb8618f..110d19a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # ChangeLog +## Version 2.4 + +- Reinstates [paquettg/php-html-parser](https://github.com/paquettg/php-html-parser) as the preferred DOM parser. + - Note that this updates many composer dependencies, so it is shipped as a separate release just in case. + ## Version 2.3 - Added ability to pass in HTML and process via the `processHTML` method diff --git a/composer.json b/composer.json index 1216000..e8967a8 100644 --- a/composer.json +++ b/composer.json @@ -12,7 +12,7 @@ "require": { "php": ">=7.2", "scotteh/php-goose": "dev-master", - "thesoftwarefanatics/php-html-parser": "^1.8.0", + "paquettg/php-html-parser": "^3.1.1", "detectlanguage/detectlanguage": "2.*", "andreskrey/readability.php": "^2.1.0" }, diff --git a/src/ArticleExtractor.php b/src/ArticleExtractor.php index b9148d1..a4fa4f8 100644 --- a/src/ArticleExtractor.php +++ b/src/ArticleExtractor.php @@ -11,8 +11,9 @@ use andreskrey\Readability\ParseException; use PHPHtmlParser\Dom; -use PHPHtmlParser\Dom\HtmlNode; -use PHPHtmlParser\Dom\TextNode; +use PHPHtmlParser\Options; +use PHPHtmlParser\Dom\Node\HtmlNode; +use PHPHtmlParser\Dom\Node\TextNode; use DetectLanguage\DetectLanguage; @@ -536,7 +537,7 @@ private function performCustomPostProcessing($html) { // Ok then try it a different way $dom = new Dom; - $dom->load($html, ['whitespaceTextNode' => false]); + $dom->loadStr($html, (new Options())->setWhitespaceTextNode(false)); // First, just completely remove the items we don't even care about $nodesToRemove = $dom->find('script, style, header, footer, input, button, aside, meta, link'); @@ -906,7 +907,7 @@ private function checkHTMLForLanguageHint($html_string) { try { // Ok then try it a different way $dom = new Dom; - $dom->load($html_string, ['whitespaceTextNode' => false]); + $dom->loadStr($html_string, (new Options())->setWhitespaceTextNode(false)); $htmltag = $dom->find('html'); 
$lang = $htmltag->getAttribute('lang');