From 7a3b9f41ef0fdd6e026262313706dd4bbfef22d5 Mon Sep 17 00:00:00 2001 From: Chris Ha Date: Thu, 12 Oct 2023 03:05:02 +0900 Subject: [PATCH] Unicode 15.1 (#32) * change data files to unicode 15.1 * Update build.rs to unicode 15.1 * update docs * Undo newline removal --------- Co-authored-by: Valentin Lorentz --- README.md | 2 +- build.rs | 5 ++++- data/NameAliases.txt | 17 ++++++++++------- data/UnicodeData.txt | 7 +++++++ src/lib.rs | 2 +- 5 files changed, 23 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index bd1edb9..5977141 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![Build Status](https://travis-ci.org/ProgVal/unicode_names2.png)](https://travis-ci.org/ProgVal/unicode_names2) Time and memory efficiently mapping characters to and from their -Unicode 15.0 names, at runtime and compile-time. +Unicode 15.1 names, at runtime and compile-time. ```rust fn main() { diff --git a/build.rs b/build.rs index 8b20d05..e073f22 100644 --- a/build.rs +++ b/build.rs @@ -1,12 +1,15 @@ use std::{env, path::PathBuf}; use unicode_names2_generator as generator; +/// [UnicodeData.txt] contains Unicode Character Data +/// +/// [UnicodeData.txt]:https://www.unicode.org/Public/15.1.0/ucd/UnicodeData.txt const UNICODE_DATA: &str = include_str!("data/UnicodeData.txt"); /// Unicode aliases /// /// [NamesList.txt] contents contains a map of unicode aliases to their corresponding values. /// -/// [NamesList.txt]: https://www.unicode.org/Public/14.0.0/ucd/NameAliases.txt +/// [NamesList.txt]: https://www.unicode.org/Public/15.1.0/ucd/NameAliases.txt const NAME_ALIASES: &str = include_str!("data/NameAliases.txt"); fn main() { diff --git a/data/NameAliases.txt b/data/NameAliases.txt index 8519284..1e6f502 100644 --- a/data/NameAliases.txt +++ b/data/NameAliases.txt @@ -1,10 +1,10 @@ -# NameAliases-14.0.0.txt -# Date: 2020-10-21, 22:28:00 GMT [KW, LI] -# © 2020 Unicode®, Inc. -# For terms of use, see http://www.unicode.org/terms_of_use.html +# NameAliases-15.1.0.txt +# Date: 2023-01-05 +# © 2023 Unicode®, Inc. +# For terms of use, see https://www.unicode.org/terms_of_use.html # # Unicode Character Database -# For documentation, see http://www.unicode.org/reports/tr44/ +# For documentation, see https://www.unicode.org/reports/tr44/ # # This file is a normative contributory data file in the # Unicode Character Database. @@ -40,7 +40,7 @@ # control codes (which for historical reasons have no Unicode character name) # or for format characters. # -# For documentation, see NamesList.html and http://www.unicode.org/reports/tr44/ +# For documentation, see NamesList.html and https://www.unicode.org/reports/tr44/ # # FORMAT # @@ -135,6 +135,7 @@ 0018;CAN;abbreviation 0019;END OF MEDIUM;control 0019;EOM;abbreviation +0019;EM;abbreviation 001A;SUBSTITUTE;control 001A;SUB;abbreviation 001B;ESCAPE;control @@ -163,7 +164,7 @@ # (and corresponding abbreviations) for these code # points are included here because these names leaked # out from the draft documents and were published in -# at least one RFC whose names for code points was +# at least one RFC whose names for code points were # implemented in Perl regex expressions. 0080;PADDING CHARACTER;figment @@ -254,6 +255,7 @@ 01A2;LATIN CAPITAL LETTER GHA;correction 01A3;LATIN SMALL LETTER GHA;correction 034F;CGJ;abbreviation +0616;ARABIC SMALL HIGH LIGATURE ALEF WITH YEH BARREE;correction 061C;ALM;abbreviation 0709;SYRIAC SUBLINEAR COLON SKEWED LEFT;correction 0CDE;KANNADA LETTER LLLA;correction @@ -271,6 +273,7 @@ 180D;FVS3;abbreviation 180E;MVS;abbreviation 180F;FVS4;abbreviation +1BBD;SUNDANESE LETTER ARCHAIC I;correction 200B;ZWSP;abbreviation 200C;ZWNJ;abbreviation 200D;ZWJ;abbreviation diff --git a/data/UnicodeData.txt b/data/UnicodeData.txt index ea963a7..bdcc418 100644 --- a/data/UnicodeData.txt +++ b/data/UnicodeData.txt @@ -11231,6 +11231,10 @@ 2FF9;IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM UPPER RIGHT;So;0;ON;;;;;N;;;;; 2FFA;IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LOWER LEFT;So;0;ON;;;;;N;;;;; 2FFB;IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID;So;0;ON;;;;;N;;;;; +2FFC;IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM RIGHT;So;0;ON;;;;;N;;;;; +2FFD;IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LOWER RIGHT;So;0;ON;;;;;N;;;;; +2FFE;IDEOGRAPHIC DESCRIPTION CHARACTER HORIZONTAL REFLECTION;So;0;ON;;;;;N;;;;; +2FFF;IDEOGRAPHIC DESCRIPTION CHARACTER ROTATION;So;0;ON;;;;;N;;;;; 3000;IDEOGRAPHIC SPACE;Zs;0;WS; 0020;;;;N;;;;; 3001;IDEOGRAPHIC COMMA;Po;0;ON;;;;;N;;;;; 3002;IDEOGRAPHIC FULL STOP;Po;0;ON;;;;;N;IDEOGRAPHIC PERIOD;;;; @@ -11705,6 +11709,7 @@ 31E1;CJK STROKE HZZZG;So;0;ON;;;;;N;;;;; 31E2;CJK STROKE PG;So;0;ON;;;;;N;;;;; 31E3;CJK STROKE Q;So;0;ON;;;;;N;;;;; +31EF;IDEOGRAPHIC DESCRIPTION CHARACTER SUBTRACTION;So;0;ON;;;;;N;;;;; 31F0;KATAKANA LETTER SMALL KU;Lo;0;L;;;;;N;;;;; 31F1;KATAKANA LETTER SMALL SI;Lo;0;L;;;;;N;;;;; 31F2;KATAKANA LETTER SMALL SU;Lo;0;L;;;;;N;;;;; @@ -34035,6 +34040,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 2CEA1;;Lo;0;L;;;;;N;;;;; 2CEB0;;Lo;0;L;;;;;N;;;;; 2EBE0;;Lo;0;L;;;;;N;;;;; +2EBF0;;Lo;0;L;;;;;N;;;;; +2EE5D;;Lo;0;L;;;;;N;;;;; 2F800;CJK COMPATIBILITY IDEOGRAPH-2F800;Lo;0;L;4E3D;;;;N;;;;; 2F801;CJK COMPATIBILITY IDEOGRAPH-2F801;Lo;0;L;4E38;;;;N;;;;; 2F802;CJK COMPATIBILITY IDEOGRAPH-2F802;Lo;0;L;4E41;;;;N;;;;; diff --git a/src/lib.rs b/src/lib.rs index af0448b..3075240 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,7 @@ //! Convert between characters and their standard names. //! //! This crate provides two functions for mapping from a `char` to the -//! name given by the Unicode standard (15.0). There are no runtime +//! name given by the Unicode standard (15.1). There are no runtime //! requirements so this is usable with only `core` (this requires //! specifying the `no_std` cargo feature). The tables are heavily //! compressed, but still large (500KB), and still offer efficient