From aceae14a1fff0814b1d3f185eac6831a4d1c57b4 Mon Sep 17 00:00:00 2001 From: Michael Tibben Date: Mon, 22 Feb 2016 11:57:03 +1100 Subject: [PATCH] Always use mb_* functions --- composer.json | 4 ++++ src/Html2Text.php | 46 ++++++++++++++++++++++++++++------------------ 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/composer.json b/composer.json index 9b59291..ba6299b 100644 --- a/composer.json +++ b/composer.json @@ -8,5 +8,9 @@ }, "require-dev": { "phpunit/phpunit": "~4" + }, + "suggest": { + "ext-mbstring": "For best performance", + "symfony/polyfill-mbstring": "If you can't install ext-mbstring" } } diff --git a/src/Html2Text.php b/src/Html2Text.php index d182c8c..b270239 100644 --- a/src/Html2Text.php +++ b/src/Html2Text.php @@ -23,6 +23,8 @@ class Html2Text { const ENCODING = 'UTF-8'; + protected $htmlFuncFlags; + /** * Contains the HTML content to convert. * @@ -236,6 +238,9 @@ public function __construct($html = '', $options = array()) $this->html = $html; $this->options = array_merge($this->options, $options); + $this->htmlFuncFlags = (PHP_VERSION_ID < 50400) + ? 
ENT_COMPAT + : ENT_COMPAT | ENT_HTML5; } /** @@ -318,6 +323,16 @@ public function set_base_url($baseurl) } protected function convert() + { + $origEncoding = mb_internal_encoding(); + mb_internal_encoding(self::ENCODING); + + $this->doConvert(); + + mb_internal_encoding($origEncoding); + } + + protected function doConvert() { $this->linkList = array(); @@ -345,7 +360,7 @@ protected function converter(&$text) $text = preg_replace_callback($this->callbackSearch, array($this, 'pregCallback'), $text); $text = strip_tags($text); $text = preg_replace($this->entSearch, $this->entReplace, $text); - $text = html_entity_decode($text, ENT_QUOTES, self::ENCODING); + $text = html_entity_decode($text, $this->htmlFuncFlags, self::ENCODING); // Remove unknown/unhandled entities (this cannot be done in search-and-replace block) $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text); @@ -395,7 +410,7 @@ protected function buildlinkList($link, $display, $linkOverride = null) $url = $link; } else { $url = $this->baseurl; - if (substr($link, 0, 1) != '/') { + if (mb_substr($link, 0, 1) != '/') { $url .= '/'; } $url .= $link; @@ -472,7 +487,7 @@ protected function convertBlockquotes(&$text) $end = $m[1]; $len = $end - $taglen - $start; // Get blockquote content - $body = substr($text, $start + $taglen - $diff, $len); + $body = mb_substr($text, $start + $taglen - $diff, $len); // Set text width $pWidth = $this->options['width']; @@ -482,20 +497,21 @@ protected function convertBlockquotes(&$text) $this->converter($body); // Add citation markers and create PRE block $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body)); - $body = '
<pre>' . htmlspecialchars($body) . '</pre>
'; + $body = '
<pre>' . htmlspecialchars($body, $this->htmlFuncFlags, self::ENCODING) . '</pre>
'; // Re-set text width $this->options['width'] = $pWidth; // Replace content - $text = substr($text, 0, $start - $diff) - . $body . substr($text, $end + strlen($m[0]) - $diff); + $text = mb_substr($text, 0, $start - $diff) + . $body + . mb_substr($text, $end + mb_strlen($m[0]) - $diff); - $diff += $len + $taglen + strlen($m[0]) - strlen($body); + $diff += $len + $taglen + mb_strlen($m[0]) - mb_strlen($body); unset($body); } } else { if ($level == 0) { $start = $m[1]; - $taglen = strlen($m[0]); + $taglen = mb_strlen($m[0]); } $level++; } @@ -511,7 +527,7 @@ protected function convertBlockquotes(&$text) */ protected function pregCallback($matches) { - switch (strtolower($matches[1])) { + switch (mb_strtolower($matches[1])) { case 'p': // Replace newlines with spaces. $para = str_replace("\n", " ", $matches[3]); @@ -585,15 +601,9 @@ protected function toupper($str) */ protected function strtoupper($str) { - $str = html_entity_decode($str, ENT_COMPAT, self::ENCODING); - - if (function_exists('mb_strtoupper')) { - $str = mb_strtoupper($str, self::ENCODING); - } else { - $str = strtoupper($str); - } - - $str = htmlspecialchars($str, ENT_COMPAT, self::ENCODING); + $str = html_entity_decode($str, $this->htmlFuncFlags, self::ENCODING); + $str = mb_strtoupper($str); + $str = htmlspecialchars($str, $this->htmlFuncFlags, self::ENCODING); return $str; }