From 3e8ced09dbd7545ff43719cb18c6374509a05d3e Mon Sep 17 00:00:00 2001 From: Ayesh Karunaratne Date: Wed, 5 Jun 2024 21:48:32 +0700 Subject: [PATCH] [PHP 8.4][Intl] Add `grapheme_str_split` Add a polyfill for the `grapheme_str_split` function added in PHP 8.4. Requires PHP 7.3, because the polyfill is based on `\X` Regex, and it only works properly on PCRE2, which [only comes with PHP 7.3+](https://php.watch/versions/7.3/pcre2). Further, there are some cases that the polyfill cannot split complex characters (such as two consecutive country flag Emojis). This is now fixed in [PCRE2Project/pcre2#410](https://github.com/PCRE2Project/pcre2/issues/410). However, this change will likely only make it to PHP 8.4. References: - [RFC: Grapheme cluster for `str_split` function: `grapheme_str_split`](https://wiki.php.net/rfc/grapheme_str_split) - [PHP.Watch: PHP 8.4: New `grapheme_str_split` function](https://php.watch/versions/8.4/grapheme_str_split) --- README.md | 1 + src/Intl/Grapheme/Grapheme.php | 33 ++++++++++++++++++++++++++++ src/Intl/Grapheme/README.md | 1 + src/Intl/Grapheme/bootstrap.php | 4 ++++ src/Intl/Grapheme/bootstrap73.php | 17 ++++++++++++++ src/Intl/Grapheme/bootstrap80.php | 3 +++ src/Php84/Php84.php | 29 ++++++++++++++++++++++++ src/Php84/README.md | 1 + src/Php84/bootstrap.php | 4 ++++ src/Php84/bootstrap73.php | 21 ++++++++++++++++++ tests/Intl/Grapheme/GraphemeTest.php | 26 ++++++++++++++++++++++ tests/Php84/Php84Test.php | 20 +++++++++++++++++ 12 files changed, 160 insertions(+) create mode 100644 src/Intl/Grapheme/bootstrap73.php create mode 100644 src/Php84/bootstrap73.php diff --git a/README.md b/README.md index 370956e57..dd6ee0d9d 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,7 @@ Polyfills are provided for: - the `Date*Exception/Error` classes introduced in PHP 8.3; - the `SQLite3Exception` class introduced in PHP 8.3; - the `mb_ucfirst` and `mb_lcfirst` functions introduced in PHP 8.4; +- the `grapheme_str_split` function introduced in 
PHP 8.4 (requires PHP >= 7.3); It is strongly recommended to upgrade your PHP version and/or install the missing extensions whenever possible. This polyfill should be used only when there is no diff --git a/src/Intl/Grapheme/Grapheme.php b/src/Intl/Grapheme/Grapheme.php index 5373f1685..b28697355 100644 --- a/src/Intl/Grapheme/Grapheme.php +++ b/src/Intl/Grapheme/Grapheme.php @@ -26,6 +26,7 @@ * - grapheme_strrpos - Find position (in grapheme units) of last occurrence of a string * - grapheme_strstr - Returns part of haystack string from the first occurrence of needle to the end of haystack * - grapheme_substr - Return part of a string + * - grapheme_str_split - Splits a string into an array of individual or chunks of graphemes. * * @author Nicolas Grekas * @@ -191,6 +192,38 @@ public static function grapheme_strstr($s, $needle, $beforeNeedle = false) return mb_strstr($s, $needle, $beforeNeedle, 'UTF-8'); } + public static function grapheme_str_split($s, $len = 1) { + if ($len < 1 || $len > 1073741823) { /* 0 is rejected as well: $length must be greater than 0 */ + if (80000 > \PHP_VERSION_ID) { + return false; + } + + throw new \ValueError('grapheme_str_split(): Argument #2 ($length) must be greater than 0 and less than or equal to 1073741823.'); + } + + if ($s === '') { + return []; + } + + preg_match_all('/\X/u', $s, $matches); + + if (empty($matches[0])) { + return false; + } + + if ($len === 1) { + return $matches[0]; + } + + $chunks = array_chunk($matches[0], $len); + + array_walk($chunks, static function(&$value) { + $value = implode('', $value); + }); + + return $chunks; + } + private static function grapheme_position($s, $needle, $offset, $mode) { $needle = (string) $needle; diff --git a/src/Intl/Grapheme/README.md b/src/Intl/Grapheme/README.md index f55d92c5c..8e936ad7f 100644 --- a/src/Intl/Grapheme/README.md +++ b/src/Intl/Grapheme/README.md @@ -21,6 +21,7 @@ This component provides a partial, native PHP implementation of the - [`grapheme_strstr`](https://php.net/grapheme_strstr): Returns part of haystack string
from the first occurrence of needle to the end of haystack - [`grapheme_substr`](https://php.net/grapheme_substr): Return part of a string +- [`grapheme_str_split`](https://php.net/grapheme_str_split): Splits a string into an array of individual or chunks of graphemes. More information can be found in the [main Polyfill README](https://github.com/symfony/polyfill/blob/main/README.md). diff --git a/src/Intl/Grapheme/bootstrap.php b/src/Intl/Grapheme/bootstrap.php index a9ea03c7e..a53c335f2 100644 --- a/src/Intl/Grapheme/bootstrap.php +++ b/src/Intl/Grapheme/bootstrap.php @@ -56,3 +56,7 @@ function grapheme_strstr($haystack, $needle, $beforeNeedle = false) { return p\G if (!function_exists('grapheme_substr')) { function grapheme_substr($string, $offset, $length = null) { return p\Grapheme::grapheme_substr($string, $offset, $length); } } + +if (\PHP_VERSION_ID >= 70300) { + require __DIR__.'/bootstrap73.php'; +} diff --git a/src/Intl/Grapheme/bootstrap73.php b/src/Intl/Grapheme/bootstrap73.php new file mode 100644 index 000000000..488202afd --- /dev/null +++ b/src/Intl/Grapheme/bootstrap73.php @@ -0,0 +1,17 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code.
+ */ + +use Symfony\Polyfill\Intl\Grapheme as p; + +if (!function_exists('grapheme_str_split') && function_exists('grapheme_substr')) { + function grapheme_str_split(string $string, int $length = 1) { return p\Grapheme::grapheme_str_split($string, $length); } +} + diff --git a/src/Intl/Grapheme/bootstrap80.php b/src/Intl/Grapheme/bootstrap80.php index b8c078677..e746b75e7 100644 --- a/src/Intl/Grapheme/bootstrap80.php +++ b/src/Intl/Grapheme/bootstrap80.php @@ -48,3 +48,6 @@ function grapheme_strstr(?string $haystack, ?string $needle, ?bool $beforeNeedle if (!function_exists('grapheme_substr')) { function grapheme_substr(?string $string, ?int $offset, ?int $length = null): string|false { return p\Grapheme::grapheme_substr((string) $string, (int) $offset, $length); } } +if (!function_exists('grapheme_str_split')) { + function grapheme_str_split(string $string, int $length = 1): array|false { return p\Grapheme::grapheme_str_split($string, $length); } +} diff --git a/src/Php84/Php84.php b/src/Php84/Php84.php index c8a9cf160..92df16a70 100644 --- a/src/Php84/Php84.php +++ b/src/Php84/Php84.php @@ -63,4 +63,33 @@ public static function mb_lcfirst(string $string, ?string $encoding = null): str return $firstChar .
mb_substr($string, 1, null, $encoding); } + + public static function grapheme_str_split(string $string, int $length = 1) + { + if ($length < 1 || $length > 1073741823) { /* 0 is rejected as well: $length must be greater than 0 */ + throw new \ValueError('grapheme_str_split(): Argument #2 ($length) must be greater than 0 and less than or equal to 1073741823.'); + } + + if ($string === '') { + return []; + } + + preg_match_all('/\X/u', $string, $matches); + + if (empty($matches[0])) { + return false; + } + + if ($length === 1) { + return $matches[0]; + } + + $chunks = array_chunk($matches[0], $length); + + array_walk($chunks, static function(&$value) { + $value = implode('', $value); + }); + + return $chunks; + } } diff --git a/src/Php84/README.md b/src/Php84/README.md index 77d249bed..ba13db76b 100644 --- a/src/Php84/README.md +++ b/src/Php84/README.md @@ -4,6 +4,7 @@ Symfony Polyfill / Php84 This component provides features added to PHP 8.4 core: - [`mb_ucfirst` and `mb_lcfirst`](https://wiki.php.net/rfc/mb_ucfirst) +- [`grapheme_str_split`](https://wiki.php.net/rfc/grapheme_str_split) More information can be found in the [main Polyfill README](https://github.com/symfony/polyfill/blob/main/README.md). diff --git a/src/Php84/bootstrap.php b/src/Php84/bootstrap.php index f73ba3d42..5f6a0a762 100644 --- a/src/Php84/bootstrap.php +++ b/src/Php84/bootstrap.php @@ -23,3 +23,7 @@ function mb_ucfirst($string, ?string $encoding = null): string { return p\Php84: if (!function_exists('mb_lcfirst')) { function mb_lcfirst($string, ?string $encoding = null): string { return p\Php84::mb_lcfirst($string, $encoding); } } + +if (\PHP_VERSION_ID >= 70300) { + require __DIR__.'/bootstrap73.php'; +} diff --git a/src/Php84/bootstrap73.php b/src/Php84/bootstrap73.php new file mode 100644 index 000000000..f0b3ea3b2 --- /dev/null +++ b/src/Php84/bootstrap73.php @@ -0,0 +1,21 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code.
+ */ + +use Symfony\Polyfill\Php84 as p; + +if (\PHP_VERSION_ID >= 80400) { + return; +} + +if (!function_exists('grapheme_str_split') && function_exists('grapheme_substr')) { + function grapheme_str_split(string $string, int $length = 1) { return p\Php84::grapheme_str_split($string, $length); } +} + diff --git a/tests/Intl/Grapheme/GraphemeTest.php b/tests/Intl/Grapheme/GraphemeTest.php index befe1e36f..37008e51c 100644 --- a/tests/Intl/Grapheme/GraphemeTest.php +++ b/tests/Intl/Grapheme/GraphemeTest.php @@ -207,4 +207,30 @@ public function testGraphemeStrstr() $this->assertSame('국어', grapheme_strstr('한국어', '국')); $this->assertSame('ÉJÀ', grapheme_stristr('DÉJÀ', 'é')); } + + /** + * @dataProvider graphemeStrSplitDataProvider + * @requires PHP 7.3 + */ + public function testGraphemeStrSplit(string $string, int $length, array $expectedValues) { + $this->assertSame($expectedValues, grapheme_str_split($string, $length)); + } + + public static function graphemeStrSplitDataProvider(): array { + $return = [ + ['', 1, []], + ['PHP', 1, ['P', 'H', 'P']], + ['你好', 1, ['你', '好']], + ['අයේෂ්', 1, ['අ', 'යේ', 'ෂ්']], + ['สวัสดี', 2, ['สวั', 'สดี']], + ['土下座🙇‍♀を', 1, ["土", "下", "座", "🙇‍♀", "を"]], + ]; + + // https://github.com/PCRE2Project/pcre2/issues/410 + if (PCRE_VERSION_MAJOR > 10 || (10 === PCRE_VERSION_MAJOR && PCRE_VERSION_MINOR >= 44)) { + $return[] = ['土下座🙇‍♀を', 1, ["土", "下", "座", "🙇‍♀", "を"]]; + } + + return $return; + } } diff --git a/tests/Php84/Php84Test.php b/tests/Php84/Php84Test.php index c66f402df..4c0c6ea1b 100644 --- a/tests/Php84/Php84Test.php +++ b/tests/Php84/Php84Test.php @@ -68,4 +68,24 @@ public static function lcFirstDataProvider(): array { ["ß", "ß"], ]; } + + /** + * @dataProvider graphemeStrSplitDataProvider + * @requires PHP 7.3 + */ + public function testGraphemeStrSplit(string $string, int $length, array $expectedValues) { + $this->assertSame($expectedValues, grapheme_str_split($string, $length)); + } + + public static function graphemeStrSplitDataProvider(): array { +
return [ + ['', 1, []], + ['PHP', 1, ['P', 'H', 'P']], + ['你好', 1, ['你', '好']], + ['අයේෂ්', 1, ['අ', 'යේ', 'ෂ්']], + ['สวัสดี', 2, ['สวั', 'สดี']], + ['土下座🙇‍♀を', 1, ["土", "下", "座", "🙇‍♀", "を"]], + // ['👭🏻👰🏿‍♂️', 2, ['👭🏻', '👰🏿‍♂️']], // https://github.com/PCRE2Project/pcre2/issues/410 + ]; + } }