From bf4a1a7bf060edc37054a9868610c74d5f1f1a36 Mon Sep 17 00:00:00 2001
From: Ayesh Karunaratne
Date: Wed, 5 Jun 2024 21:48:32 +0700
Subject: [PATCH] [PHP 8.4][Intl] Add `grapheme_str_split`

Add a polyfill for the `grapheme_str_split` function added in PHP 8.4.

Requires PHP 7.3, because the polyfill is based on the `\X` regex, which only works properly on PCRE2, and PCRE2 [only comes with PHP 7.3+](https://php.watch/versions/7.3/pcre2).

Further, there are some cases in which the polyfill cannot split complex characters (such as two consecutive country flag emojis). This is now fixed in [PCRE2Project/pcre2#410](https://github.com/PCRE2Project/pcre2/issues/410). However, this change will likely only make it to PHP 8.4.

References:
 - [RFC: Grapheme cluster for `str_split` function: `grapheme_str_split`](https://wiki.php.net/rfc/grapheme_str_split)
 - [PHP.Watch: PHP 8.4: New `grapheme_str_split` function](https://php.watch/versions/8.4/grapheme_str_split)
---
 README.md                            |  1 +
 src/Intl/Grapheme/Grapheme.php       | 32 ++++++++++++++++++++++++++++
 src/Intl/Grapheme/README.md          |  1 +
 src/Intl/Grapheme/bootstrap80.php    |  3 +++
 src/Php84/Php84.php                  | 30 ++++++++++++++++++++++++++
 src/Php84/README.md                  |  1 +
 src/Php84/bootstrap.php              |  4 ++++
 tests/Intl/Grapheme/GraphemeTest.php | 27 +++++++++++++++++++++++
 tests/Php84/Php84Test.php            | 21 ++++++++++++++++++
 9 files changed, 120 insertions(+)

diff --git a/README.md b/README.md
index 7517fba8c..be11170e3 100644
--- a/README.md
+++ b/README.md
@@ -72,6 +72,7 @@ Polyfills are provided for:
 - the `Deprecated` attribute introduced in PHP 8.4;
 - the `mb_trim`, `mb_ltrim` and `mb_rtrim` functions introduced in PHP 8.4;
 - the `CURL_HTTP_VERSION_3` and `CURL_HTTP_VERSION_3ONLY` constants introduced in PHP 8.4;
+- the `grapheme_str_split` function introduced in PHP 8.4;
 
 It is strongly recommended to upgrade your PHP version and/or install the missing
 extensions whenever possible.
This polyfill should be used only when there is no
diff --git a/src/Intl/Grapheme/Grapheme.php b/src/Intl/Grapheme/Grapheme.php
index 5373f1685..f48c79379 100644
--- a/src/Intl/Grapheme/Grapheme.php
+++ b/src/Intl/Grapheme/Grapheme.php
@@ -26,6 +26,7 @@
  * - grapheme_strrpos - Find position (in grapheme units) of last occurrence of a string
  * - grapheme_strstr - Returns part of haystack string from the first occurrence of needle to the end of haystack
  * - grapheme_substr - Return part of a string
+ * - grapheme_str_split - Splits a string into an array of individual or chunks of graphemes.
  *
  * @author Nicolas Grekas
  *
@@ -191,6 +192,46 @@ public static function grapheme_strstr($s, $needle, $beforeNeedle = false)
         return mb_strstr($s, $needle, $beforeNeedle, 'UTF-8');
     }
 
+    /**
+     * Splits a string into grapheme clusters, or into chunks of $len grapheme clusters.
+     *
+     * @param string $s   The string to split
+     * @param int    $len The number of grapheme clusters per returned element
+     *
+     * @return array|false The chunks in order; false when $s cannot be matched against the
+     *                     grapheme cluster pattern, or on PHP < 8 when $len is out of range
+     *
+     * @throws \ValueError On PHP >= 8, when $len is lower than 1 or greater than 1073741823
+     */
+    public static function grapheme_str_split($s, $len = 1)
+    {
+        // 0 must be rejected too: the message promises "greater than 0" and array_chunk() needs a size of at least 1.
+        if (1 > $len || 1073741823 < $len) {
+            if (80000 > \PHP_VERSION_ID) {
+                return false;
+            }
+
+            throw new \ValueError('grapheme_str_split(): Argument #2 ($length) must be greater than 0 and less than or equal to 1073741823.');
+        }
+
+        if ('' === $s) {
+            return [];
+        }
+
+        if (!preg_match_all('/('.SYMFONY_GRAPHEME_CLUSTER_RX.')/u', $s, $matches)) {
+            return false;
+        }
+
+        if (1 === $len) {
+            return $matches[0];
+        }
+
+        // Glue each group of $len clusters back into one string.
+        return array_map(static function (array $chunk) {
+            return implode('', $chunk);
+        }, array_chunk($matches[0], $len));
+    }
+
     private static function grapheme_position($s, $needle, $offset, $mode)
     {
         $needle = (string) $needle;
diff --git a/src/Intl/Grapheme/README.md b/src/Intl/Grapheme/README.md
index f55d92c5c..8e936ad7f 100644
--- a/src/Intl/Grapheme/README.md
+++ b/src/Intl/Grapheme/README.md
@@ -21,6 +21,7 @@ This component provides a partial, native PHP implementation of the
 - [`grapheme_strstr`](https://php.net/grapheme_strstr): Returns part of haystack
   string from the first occurrence of needle to the end of haystack
 - [`grapheme_substr`](https://php.net/grapheme_substr): Return part of a string
+- 
[`grapheme_str_split`](https://php.net/grapheme_str_split): Splits a string into an array of individual or chunks of graphemes.
 
 More information can be found in the
 [main Polyfill README](https://github.com/symfony/polyfill/blob/main/README.md).
diff --git a/src/Intl/Grapheme/bootstrap80.php b/src/Intl/Grapheme/bootstrap80.php
index b8c078677..e746b75e7 100644
--- a/src/Intl/Grapheme/bootstrap80.php
+++ b/src/Intl/Grapheme/bootstrap80.php
@@ -48,3 +48,6 @@ function grapheme_strstr(?string $haystack, ?string $needle, ?bool $beforeNeedle
 if (!function_exists('grapheme_substr')) {
     function grapheme_substr(?string $string, ?int $offset, ?int $length = null): string|false { return p\Grapheme::grapheme_substr((string) $string, (int) $offset, $length); }
 }
+if (!function_exists('grapheme_str_split')) {
+    function grapheme_str_split(string $string, int $length = 1): array|false { return p\Grapheme::grapheme_str_split($string, $length); }
+}
diff --git a/src/Php84/Php84.php b/src/Php84/Php84.php
index 1bca70b56..3ae06d2bd 100644
--- a/src/Php84/Php84.php
+++ b/src/Php84/Php84.php
@@ -169,4 +169,42 @@ private static function mb_internal_trim(string $regex, string $string, ?string
 
         return mb_convert_encoding($string, $encoding, 'UTF-8');
     }
+
+    /**
+     * Splits a string into grapheme clusters, or into chunks of $length grapheme clusters.
+     *
+     * @return array|false The chunks in order; false when $string cannot be matched against the
+     *                     grapheme cluster pattern
+     *
+     * @throws \ValueError When $length is lower than 1 or greater than 1073741823
+     */
+    public static function grapheme_str_split(string $string, int $length)
+    {
+        // 0 must be rejected too: the message promises "greater than 0" and array_chunk() needs a size of at least 1.
+        if (1 > $length || 1073741823 < $length) {
+            throw new \ValueError('grapheme_str_split(): Argument #2 ($length) must be greater than 0 and less than or equal to 1073741823.');
+        }
+
+        if ('' === $string) {
+            return [];
+        }
+
+        // Use \X for PCRE 8.32+ on the 8.x line or 10.39+ on the 10.x line; otherwise fall back to an explicit cluster pattern.
+        $regex = ((float) \PCRE_VERSION < 10 ? (float) \PCRE_VERSION >= 8.32 : (float) \PCRE_VERSION >= 10.39)
+            ? '\X'
+            : '(?:\r\n|(?:[ -~\x{200C}\x{200D}]|[ᆨ-ᇹ]+|[ᄀ-ᅟ]*(?:[가개갸걔거게겨계고과괘괴교구궈궤귀규그긔기까깨꺄꺠꺼께껴꼐꼬꽈꽤꾀꾜꾸꿔꿰뀌뀨끄끠끼나내냐냬너네녀녜노놔놰뇌뇨누눠눼뉘뉴느늬니다대댜댸더데뎌뎨도돠돼되됴두둬뒈뒤듀드듸디따때땨떄떠떼뗘뗴또똬뙈뙤뚀뚜뚸뛔뛰뜌뜨띄띠라래랴럐러레려례로롸뢔뢰료루뤄뤠뤼류르릐리마매먀먜머메며몌모뫄뫠뫼묘무뭐뭬뮈뮤므믜미바배뱌뱨버베벼볘보봐봬뵈뵤부붜붸뷔뷰브븨비빠빼뺘뺴뻐뻬뼈뼤뽀뽜뽸뾔뾰뿌뿨쀄쀠쀼쁘쁴삐사새샤섀서세셔셰소솨쇄쇠쇼수숴쉐쉬슈스싀시싸쌔쌰썌써쎄쎠쎼쏘쏴쐐쐬쑈쑤쒀쒜쒸쓔쓰씌씨아애야얘어에여예오와왜외요우워웨위유으의이자재쟈쟤저제져졔조좌좨죄죠주줘줴쥐쥬즈즤지짜째쨔쨰쩌쩨쪄쪠쪼쫘쫴쬐쬬쭈쭤쮀쮜쮸쯔쯰찌차채챠챼처체쳐쳬초촤쵀최쵸추춰췌취츄츠츼치카캐캬컈커케켜켸코콰쾌쾨쿄쿠쿼퀘퀴큐크킈키타태탸턔터테텨톄토톼퇘퇴툐투퉈퉤튀튜트틔티파패퍄퍠퍼페펴폐포퐈퐤푀표푸풔풰퓌퓨프픠피하해햐햬허헤혀혜호화홰회효후훠훼휘휴흐희히]?[ᅠ-ᆢ]+|[가-힣])[ᆨ-ᇹ]*|[ᄀ-ᅟ]+|[^\p{Cc}\p{Cf}\p{Zl}\p{Zp}])[\p{Mn}\p{Me}\x{09BE}\x{09D7}\x{0B3E}\x{0B57}\x{0BBE}\x{0BD7}\x{0CC2}\x{0CD5}\x{0CD6}\x{0D3E}\x{0D57}\x{0DCF}\x{0DDF}\x{200C}\x{200D}\x{1D165}\x{1D16E}-\x{1D172}]*|[\p{Cc}\p{Cf}\p{Zl}\p{Zp}])';
+
+        if (!preg_match_all('/'.$regex.'/u', $string, $matches)) {
+            return false;
+        }
+
+        if (1 === $length) {
+            return $matches[0];
+        }
+
+        // Glue each group of $length clusters back into one string.
+        return array_map(static function (array $chunk) {
+            return implode('', $chunk);
+        }, array_chunk($matches[0], $length));
+    }
 }
diff --git a/src/Php84/README.md b/src/Php84/README.md
index bd56cdc95..3cabb66a2 100644
--- a/src/Php84/README.md
+++ b/src/Php84/README.md
@@ -7,6 +7,7 @@ This component provides features added to PHP 8.4 core:
 - [`array_find`, `array_find_key`, `array_any` and `array_all`](https://wiki.php.net/rfc/array_find)
 - [`Deprecated`](https://wiki.php.net/rfc/deprecated_attribute)
 - `CURL_HTTP_VERSION_3` and `CURL_HTTP_VERSION_3ONLY` constants
+- [`grapheme_str_split`](https://wiki.php.net/rfc/grapheme_str_split)
 
 More information can be found in the
 [main Polyfill README](https://github.com/symfony/polyfill/blob/main/README.md).
diff --git a/src/Php84/bootstrap.php b/src/Php84/bootstrap.php
index 73fb1f404..082df306c 100644
--- a/src/Php84/bootstrap.php
+++ b/src/Php84/bootstrap.php
@@ -60,3 +60,7 @@ function mb_ltrim(string $string, ?string $characters = null, ?string $encoding
         function mb_rtrim(string $string, ?string $characters = null, ?string $encoding = null): string { return p\Php84::mb_rtrim($string, $characters, $encoding); }
     }
 }
+
+if (!function_exists('grapheme_str_split') && function_exists('grapheme_substr')) {
+    function grapheme_str_split(string $string, int $length = 1) { return p\Php84::grapheme_str_split($string, $length); }
+}
diff --git a/tests/Intl/Grapheme/GraphemeTest.php b/tests/Intl/Grapheme/GraphemeTest.php
index 7bd77241f..99d4874f2 100644
--- a/tests/Intl/Grapheme/GraphemeTest.php
+++ b/tests/Intl/Grapheme/GraphemeTest.php
@@ -209,4 +209,37 @@ public function testGraphemeStrstr()
         $this->assertSame('국어', grapheme_strstr('한국어', '국'));
         $this->assertSame('ÉJÀ', grapheme_stristr('DÉJÀ', 'é'));
     }
+
+    /**
+     * Checks that grapheme_str_split() returns the expected chunks for each dataset.
+     *
+     * @dataProvider graphemeStrSplitDataProvider
+     */
+    public function testGraphemeStrSplit(string $string, int $length, array $expectedValues)
+    {
+        $this->assertSame($expectedValues, grapheme_str_split($string, $length));
+    }
+
+    /**
+     * @return array<int, array{0: string, 1: int, 2: string[]}> Rows of [input, chunk length, expected chunks]
+     */
+    public static function graphemeStrSplitDataProvider(): array
+    {
+        $return = [
+            ['', 1, []],
+            ['PHP', 1, ['P', 'H', 'P']],
+            ['你好', 1, ['你', '好']],
+            ['අයේෂ්', 1, ['අ', 'යේ', 'ෂ්']],
+            ['สวัสดี', 2, ['สวั', 'สดี']],
+            ['土下座🙇‍♀を', 1, ['土', '下', '座', '🙇‍♀', 'を']],
+        ];
+
+        // Gated on PCRE 10.44+ (major AND minor are taken into account), see https://github.com/PCRE2Project/pcre2/issues/410
+        // NOTE(review): this dataset repeats the last unconditional one; confirm which input was meant to be gated.
+        if ((float) \PCRE_VERSION >= 10.44) {
+            $return[] = ['土下座🙇‍♀を', 1, ['土', '下', '座', '🙇‍♀', 'を']];
+        }
+
+        return $return;
+    }
 }
diff --git a/tests/Php84/Php84Test.php b/tests/Php84/Php84Test.php
index 9d573a186..a05a2d9fc 100644
--- a/tests/Php84/Php84Test.php
+++ b/tests/Php84/Php84Test.php
@@ -319,4 +319,31 @@ public static function mbRTrimProvider(): iterable
 
         yield ["foo\n", "foo\n", 'o'];
     }
+
+    /**
+     * Checks that grapheme_str_split() returns the expected chunks for each dataset.
+     *
+     * @dataProvider graphemeStrSplitDataProvider
+     */
+    public function testGraphemeStrSplit(string $string, int $length, array $expectedValues)
+    {
+        $this->assertSame($expectedValues, grapheme_str_split($string, $length));
+    }
+
+    /**
+     * @return array<int, array{0: string, 1: int, 2: string[]}> Rows of [input, chunk length, expected chunks]
+     */
+    public static function graphemeStrSplitDataProvider(): array
+    {
+        return [
+            ['', 1, []],
+            ['PHP', 1, ['P', 'H', 'P']],
+            ['你好', 1, ['你', '好']],
+            ['අයේෂ්', 1, ['අ', 'යේ', 'ෂ්']],
+            ['สวัสดี', 2, ['สวั', 'สดี']],
+            ['土下座🙇‍♀を', 1, ['土', '下', '座', '🙇‍♀', 'を']],
+            // Disabled, see https://github.com/PCRE2Project/pcre2/issues/410 (length is 1: one cluster per expected element)
+            // ['👭🏻👰🏿‍♂️', 1, ['👭🏻', '👰🏿‍♂️']],
+        ];
+    }
 }