Skip to content

Commit

Permalink
Implement full Unicode 16.0.0 extended grapheme breaking. (#719)
Browse files Browse the repository at this point in the history
Implement full Unicode 16.0.0 extended grapheme breaking.

Includes rule GB9c (Indic Conjunct Break based breaking).

This change has a significant cost in size since the
information needed per character no longer fits in 4 bits.
The base table is therefore twice as big (one byte per entry
rather than half of that).

The number of states in the state automatons has also
increased slightly, but in comparison that's a negligible change.

Tests have been made more thorough, testing not only the
Unicode Consortium provided tests, but also variants of those
with representative characters for each category of character
that are either in or not in the BMP, to test that surrogate pair
decoding works correctly.

Tests also check that the created automatons are minimal,
in that no state is unreachable and no two states are
indistinguishable.
  • Loading branch information
lrhn authored Nov 20, 2024
1 parent 6af0821 commit 1de8372
Show file tree
Hide file tree
Showing 33 changed files with 23,533 additions and 18,145 deletions.
4 changes: 0 additions & 4 deletions pkgs/characters/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,6 @@ using a [`CharacterRange`][CharacterRange].

Based on Unicode <!-- unicode-version -->version 16.0.0<!-- /unicode-version -->.

This package is not script-aware, and does not currently support the rule for
Indic Conjunct Breaks introduced in Unicode 15.1.0
([GB9c](https://www.unicode.org/reports/tr29/tr29-43.html#GB9c)).

## Unicode characters and representations

There is no such thing as plain text.
Expand Down
4 changes: 0 additions & 4 deletions pkgs/characters/analysis_options.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1 @@
include: package:dart_flutter_team_lints/analysis_options.yaml

analyzer:
errors:
prefer_single_quotes: ignore
32 changes: 16 additions & 16 deletions pkgs/characters/benchmark/benchmark.dart
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

// Benchmark of efficiency of grapheme cluster operations.

import "package:characters/characters.dart";
import 'package:characters/characters.dart';

import "../test/src/text_samples.dart";
import '../test/src/text_samples.dart';

double bench(int Function() action, int ms) {
var elapsed = 0;
Expand Down Expand Up @@ -49,12 +49,12 @@ int reverseStrings() {
var revHangul = reverse(hangul);
var rev2Hangul = reverse(revHangul);
if (hangul != rev2Hangul || hangul == revHangul) {
throw AssertionError("Bad reverse");
throw AssertionError('Bad reverse');
}
var revGenesis = reverse(genesis);
var rev2Genesis = reverse(revGenesis);
if (genesis != rev2Genesis || genesis == revGenesis) {
throw AssertionError("Bad reverse");
throw AssertionError('Bad reverse');
}

return (hangul.length + genesis.length) * 2;
Expand All @@ -63,16 +63,16 @@ int reverseStrings() {
int replaceStrings() {
var count = 0;
{
const language = "한글";
const language = '한글';
assert(language.length == 6);
var chars = Characters(hangul);
var replaced =
chars.replaceAll(Characters(language), Characters("Hangul!"));
chars.replaceAll(Characters(language), Characters('Hangul!'));
count += replaced.string.length - hangul.length;
}
{
var chars = Characters(genesis);
var replaced = chars.replaceAll(Characters("And"), Characters("Also"));
var replaced = chars.replaceAll(Characters('And'), Characters('Also'));
count += replaced.string.length - genesis.length;
}
return count;
Expand Down Expand Up @@ -111,27 +111,27 @@ void main(List<String> args) {

for (var i = 0; i < count; i++) {
var performance = bench(iterateIndicesOnly, 2000);
print("Index Iteration: ${toDigits(performance)} gc/ms");
print('Index Iteration: ${toDigits(performance)} gc/ms');
if (performance > bestIterateIndices) bestIterateIndices = performance;

performance = bench(iterateStrings, 2000);
print("String Iteration: ${toDigits(performance)} cu/ms");
print('String Iteration: ${toDigits(performance)} cu/ms');
if (performance > bestIterateStrings) bestIterateStrings = performance;

performance = bench(reverseStrings, 2000);
print("String Reversing: ${toDigits(performance)} cu/ms");
print('String Reversing: ${toDigits(performance)} cu/ms');
if (performance > bestReverseStrings) bestReverseStrings = performance;

performance = bench(replaceStrings, 2000);
print("String Replacing: ${toDigits(performance)} changes/ms");
print('String Replacing: ${toDigits(performance)} changes/ms');
if (performance > bestReplaceStrings) bestReplaceStrings = performance;
}

if (count > 1) {
print("Best: ");
print("Index Iteration: ${toDigits(bestIterateIndices)} gc/ms");
print("String Iteration: ${toDigits(bestIterateStrings)} cu/ms");
print("String Reversing: ${toDigits(bestReverseStrings)} cu/ms");
print("String Replacing: ${toDigits(bestReplaceStrings)} changes/ms");
print('Best: ');
print('Index Iteration: ${toDigits(bestIterateIndices)} gc/ms');
print('String Iteration: ${toDigits(bestIterateStrings)} cu/ms');
print('String Reversing: ${toDigits(bestReverseStrings)} cu/ms');
print('String Replacing: ${toDigits(bestReplaceStrings)} changes/ms');
}
}
4 changes: 2 additions & 2 deletions pkgs/characters/lib/characters.dart
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@
/// String operations based on characters (Unicode grapheme clusters).
library;

export "src/characters.dart";
export "src/extensions.dart";
export 'src/characters.dart';
export 'src/extensions.dart';
16 changes: 8 additions & 8 deletions pkgs/characters/lib/src/characters.dart
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.

import "characters_impl.dart";
import 'characters_impl.dart';

/// The characters of a string.
///
Expand All @@ -21,7 +21,7 @@ import "characters_impl.dart";
/// in different ways.
abstract class Characters implements Iterable<String> {
/// An empty [Characters] containing no characters.
static const Characters empty = StringCharacters("");
static const Characters empty = StringCharacters('');

/// Creates a [Characters] allowing iteration of
/// the characters of [string].
Expand Down Expand Up @@ -260,9 +260,9 @@ abstract class Characters implements Iterable<String> {
/// Any further occurrences will be included in the last part.
/// Example:
/// ```dart
/// var c = "abracadabra".characters;
/// var parts = c.split("a".characters, 4).toList();
/// print(parts); // Prints is ["", "br", "c", "dabra"]
/// var c = 'abracadabra'.characters;
/// var parts = c.split('a'.characters, 4).toList();
/// print(parts); // Prints is ['', 'br', 'c', 'dabra']
/// ```
/// If there are fewer than `maxParts - 1` occurrences of [pattern],
/// then the characters are split at all occurrences.
Expand Down Expand Up @@ -790,9 +790,9 @@ abstract class CharacterRange implements Iterator<String> {
///
/// Example:
/// ```dart
/// var c = "abracadabra".characters.dropFirst().dropLast();
/// // c is "bracadabr".
/// var parts = c.split("a".characters, 3).toList();
/// var c = 'abracadabra'.characters.dropFirst().dropLast();
/// // c is 'bracadabr'.
/// var parts = c.split('a'.characters, 3).toList();
/// print(parts); // [br, c, dabr]
/// ```
/// If there are fewer than `maxParts - 1` occurrences of [pattern],
Expand Down
54 changes: 27 additions & 27 deletions pkgs/characters/lib/src/characters_impl.dart
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.

import "characters.dart";
import "grapheme_clusters/breaks.dart";
import "grapheme_clusters/constants.dart";
import 'characters.dart';
import 'grapheme_clusters/breaks.dart';
import 'grapheme_clusters/constants.dart';
import 'grapheme_clusters/table.dart';

/// The grapheme clusters of a string.
Expand All @@ -28,23 +28,23 @@ final class StringCharacters extends Iterable<String> implements Characters {

@override
String get first => string.isEmpty
? throw StateError("No element")
? throw StateError('No element')
: string.substring(
0, Breaks(string, 0, string.length, stateSoTNoBreak).nextBreak());

@override
String get last => string.isEmpty
? throw StateError("No element")
? throw StateError('No element')
: string.substring(
BackBreaks(string, string.length, 0, stateEoTNoBreak).nextBreak());

@override
String get single {
if (string.isEmpty) throw StateError("No element");
if (string.isEmpty) throw StateError('No element');
var firstEnd =
Breaks(string, 0, string.length, stateSoTNoBreak).nextBreak();
if (firstEnd == string.length) return string;
throw StateError("Too many elements");
throw StateError('Too many elements');
}

@override
Expand Down Expand Up @@ -74,9 +74,9 @@ final class StringCharacters extends Iterable<String> implements Characters {
}

@override
String join([String separator = ""]) {
if (separator == "") return string;
return _explodeReplace(string, 0, string.length, separator, "");
String join([String separator = '']) {
if (separator == '') return string;
return _explodeReplace(string, 0, string.length, separator, '');
}

@override
Expand All @@ -91,12 +91,12 @@ final class StringCharacters extends Iterable<String> implements Characters {
cursor = next;
}
if (orElse != null) return orElse();
throw StateError("No element");
throw StateError('No element');
}

@override
String elementAt(int index) {
RangeError.checkNotNegative(index, "index");
RangeError.checkNotNegative(index, 'index');
var count = 0;
if (string.isNotEmpty) {
var breaks = Breaks(string, 0, string.length, stateSoTNoBreak);
Expand All @@ -108,7 +108,7 @@ final class StringCharacters extends Iterable<String> implements Characters {
start = end;
}
}
throw RangeError.index(index, this, "index", null, count);
throw RangeError.index(index, this, 'index', null, count);
}

@override
Expand Down Expand Up @@ -209,7 +209,7 @@ final class StringCharacters extends Iterable<String> implements Characters {

@override
Characters skip(int count) {
RangeError.checkNotNegative(count, "count");
RangeError.checkNotNegative(count, 'count');
return _skip(count);
}

Expand All @@ -221,7 +221,7 @@ final class StringCharacters extends Iterable<String> implements Characters {

@override
Characters take(int count) {
RangeError.checkNotNegative(count, "count");
RangeError.checkNotNegative(count, 'count');
return _take(count);
}

Expand All @@ -233,9 +233,9 @@ final class StringCharacters extends Iterable<String> implements Characters {

@override
Characters getRange(int start, [int? end]) {
RangeError.checkNotNegative(start, "start");
RangeError.checkNotNegative(start, 'start');
if (end == null) return _skip(start);
if (end < start) throw RangeError.range(end, start, null, "end");
if (end < start) throw RangeError.range(end, start, null, 'end');
if (end == start) return Characters.empty;
if (start == 0) return _take(end);
if (string.isEmpty) return this;
Expand All @@ -254,10 +254,10 @@ final class StringCharacters extends Iterable<String> implements Characters {
while (position > 0) {
position--;
start = breaks.nextBreak();
if (start < 0) throw StateError("No element");
if (start < 0) throw StateError('No element');
}
var end = breaks.nextBreak();
if (end < 0) throw StateError("No element");
if (end < 0) throw StateError('No element');
if (start == 0 && end == string.length) return this;
return StringCharacters(string.substring(start, end));
}
Expand Down Expand Up @@ -311,7 +311,7 @@ final class StringCharacters extends Iterable<String> implements Characters {

@override
Characters skipLast(int count) {
RangeError.checkNotNegative(count, "count");
RangeError.checkNotNegative(count, 'count');
if (count == 0) return this;
if (string.isNotEmpty) {
var breaks = BackBreaks(string, string.length, 0, stateEoTNoBreak);
Expand Down Expand Up @@ -351,7 +351,7 @@ final class StringCharacters extends Iterable<String> implements Characters {

@override
Characters takeLast(int count) {
RangeError.checkNotNegative(count, "count");
RangeError.checkNotNegative(count, 'count');
if (count == 0) return Characters.empty;
if (string.isNotEmpty) {
var breaks = BackBreaks(string, string.length, 0, stateEoTNoBreak);
Expand Down Expand Up @@ -446,7 +446,7 @@ class StringCharacterRange implements CharacterRange {
factory StringCharacterRange.at(String string, int startIndex,
[int? endIndex]) {
RangeError.checkValidRange(
startIndex, endIndex, string.length, "startIndex", "endIndex");
startIndex, endIndex, string.length, 'startIndex', 'endIndex');
return _expandRange(string, startIndex, endIndex ?? startIndex);
}

Expand Down Expand Up @@ -501,7 +501,7 @@ class StringCharacterRange implements CharacterRange {
}
}
state = move(state, category);
if (state & stateNoBreak == 0 && --count == 0) {
if (state & maskBreak != flagNoBreak && --count == 0) {
_move(newStart, index);
return true;
}
Expand All @@ -513,7 +513,7 @@ class StringCharacterRange implements CharacterRange {
_move(newStart, _end);
return true;
} else {
throw RangeError.range(count, 0, null, "count");
throw RangeError.range(count, 0, null, 'count');
}
}

Expand All @@ -530,7 +530,7 @@ class StringCharacterRange implements CharacterRange {
bool moveBack([int count = 1]) => _retractStart(count, _start);

bool _retractStart(int count, int newEnd) {
RangeError.checkNotNegative(count, "count");
RangeError.checkNotNegative(count, 'count');
var breaks = _backBreaksFromStart();
var start = _start;
while (count > 0) {
Expand Down Expand Up @@ -578,7 +578,7 @@ class StringCharacterRange implements CharacterRange {

@override
bool dropFirst([int count = 1]) {
RangeError.checkNotNegative(count, "count");
RangeError.checkNotNegative(count, 'count');
if (_start == _end) return count == 0;
var breaks = Breaks(_string, _start, _end, stateSoTNoBreak);
while (count > 0) {
Expand Down Expand Up @@ -636,7 +636,7 @@ class StringCharacterRange implements CharacterRange {

@override
bool dropLast([int count = 1]) {
RangeError.checkNotNegative(count, "count");
RangeError.checkNotNegative(count, 'count');
var breaks = BackBreaks(_string, _end, _start, stateEoTNoBreak);
while (count > 0) {
var nextBreak = breaks.nextBreak();
Expand Down
Loading

0 comments on commit 1de8372

Please sign in to comment.