From ad3919bf1fdf0e77817559316f6fedb668784cf1 Mon Sep 17 00:00:00 2001 From: Christopher Scheidel Date: Thu, 23 Jan 2020 17:08:32 -0500 Subject: [PATCH] Closes #25. Updates text output format cleaning up newlines related to HTML headers. --- CHANGELOG.md | 8 +++++++- README.md | 18 ++++++++++++++++++ src/ArticleExtractor.php | 37 ++++++++++++++----------------------- 3 files changed, 39 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 73df312..917cfc3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # ChangeLog +## Version 1.0 + +- Updated to modify the approach for cleaning HTML tags and dealing with newlines. +- Updated README.md to outline the new text format. +- Closes issue #25 + ## Version 0.9 - Updated to include cleaning up of article text. @@ -11,7 +17,7 @@ ## Version 0.8.4 -- Resolved 301 redirects to incomplete URL +- Resolved 301 redirects to incomplete URL ## Version 0.8.3 diff --git a/README.md b/README.md index 69ad71a..505c1fb 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,24 @@ $extractor = new ArticleExtractor('your api key'); ``` +## Output Format + +As of version 1.0, the output format has been altered to provide newline breaks for headings. This is especially important for natural language processing applications when determining sentence boundaries. If this behavior is not desired, simply strip out the additional newlines where needed. + +This change was made due to the fact that when header and paragraph HTML elements are simply stripped out, there are often issues where there is no separation between the heading and the following sentence.
+ +**Example of Output Format for Text Field** + +``` +\n +A database containing 250 million Microsoft customer records has been found unsecured and online\n +NurPhoto via Getty Images\n +A new report reveals that 250 million Microsoft customer records, spanning 14 years, have been exposed online without password protection.\n +Microsoft has been in the news for, mostly, the wrong reasons recently. There is the Internet Explorer zero-day vulnerability that Microsoft hasn't issued a patch for, despite it being actively exploited. That came just days after the U.S. Government issued a critical Windows 10 update now alert concerning the "extraordinarily serious" curveball crypto vulnerability. Now a newly published report, has revealed that 250 million Microsoft customer records, spanning an incredible 14 years in all, have been exposed online in a database with no password protection.\n +What Microsoft customer records were exposed online, and where did they come from?\n +``` + + ## Running tests Unit tests are included in this distribution and can be run utilizing PHPUnit diff --git a/src/ArticleExtractor.php b/src/ArticleExtractor.php index 00d3725..f5811b5 100644 --- a/src/ArticleExtractor.php +++ b/src/ArticleExtractor.php @@ -134,26 +134,6 @@ public function processURL($url) { return $results; } - /** - Space between the words (and without line-break) - */ - private function rip_tags($string) { - - // ----- remove HTML TAGs ----- - $string = preg_replace ('/<[^>]*>/', ' ', $string); - - // ----- remove control characters ----- - $string = str_replace("\r", '', $string); // --- replace with empty space - $string = str_replace("\n", ' ', $string); // --- replace with space - $string = str_replace("\t", ' ', $string); // --- replace with space - - // ----- remove multiple spaces ----- - $string = trim(preg_replace('/ {2,}/', ' ', $string)); - - return $string; - - } - /** * Attempts to parse via the Readability libary aReturns the following array. 
@@ -179,10 +159,21 @@ private function parseViaReadability($url) { $readability->parse($html); $title = $readability->getTitle(); $text = $readability->getContent(); - $text = strip_tags($this->rip_tags($text)); // Remove all HTML tags + + // Replace all <h1>-<h6> and <p> tags with newlines + $text = preg_replace ('/<h[1-6]>/', "\n", $text); + $text = preg_replace ('/<\/h[1-6]>/', "\n", $text); + $text = preg_replace ('/<p>/', "\n", $text); + $text = preg_replace ('/<\/p>/', "\n", $text); + + $text = strip_tags($text); // Remove all HTML tags $text = html_entity_decode($text); // Make sure we have no HTML entities left over - //$text = str_replace("\r\r", "\r", $text); // remove carriage returns - //$text = str_replace("\n\n", "\n", $text); // remove excessive line returns + + $text = str_replace("\t", " ", $text); // Replace tabs with spaces + $text = preg_replace('/ {2,}/', ' ', $text); // Remove multiple spaces + + $text = str_replace("\r", "\n", $text); // convert carriage returns to newlines + $text = preg_replace("/(\n)+/", "$1", $text); // remove excessive line returns } catch (ParseException $e) {