Merge pull request #11 from Mange/release-39

Update to version 39 of the CLDR dataset
Mange · May 18, 2021 · 3e32c3c · 3e32c3c
2 parents 187d4f2 + 27c068d
commit 3e32c3c
Show file tree

Hide file tree

Showing 16 changed files with 137 additions and 39 deletions.
diff --git a/Makefile b/Makefile
@@ -4,7 +4,7 @@
 DATA_FILES := $(patsubst views/%.jq,data/%,$(wildcard views/*.jq)) data/all.json
 INPUT_FILES := $(wildcard cldr/common/annotations/*.xml) \
 	$(wildcard cldr/common/annotationsDerived/*.xml) \
-	cldr/tools/java/org/unicode/cldr/util/data/emoji/emoji-test.txt
+	cldr/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/emoji/emoji-test.txt
 
 all: cldr $(DATA_FILES)
 

diff --git a/README.md b/README.md
@@ -49,7 +49,7 @@ as you want; only outdated files will be rebuilt.
 
 Here are the current views, and an example of what they look like.
 
-### `all_emojis.json`
+#### `all_emojis.json`
 
 A JSON array of all emojis as objects with the following keys:
 
@@ -64,7 +64,7 @@ An JSON array of all emojis as objects with the following keys:
 * `qualification` - **String**. Either "fully-qualified", "unqualified", or
   undetermined.
 
-#### Sample
+##### Sample
 
 ```json
 [
@@ -99,7 +99,7 @@ An JSON array of all emojis as objects with the following keys:
 ]
 ```
 
-### `all_emojis.txt`
+#### `all_emojis.txt`
 
 A tab-separated text file with one emoji on each line. Keywords are joined with
 pipes (`|`) in the last column.
@@ -114,18 +114,18 @@ Columns are, in order:
 * `en_keywords` - List of English keywords for the emoji, joined with pipes
   (`|`).
 
-#### Sample
+##### Sample
 
 ```text
 😁    Smileys & People        face-positive   beaming face with smiling eyes	beaming face with smiling eyes | eye | face | grin | smile
 😂    Smileys & People        face-positive   face with tears of joy	face | face with tears of joy | joy | laugh | tear
 ```
 
-### `categories.txt`
+#### `categories.txt`
 
 A list of all categories in the dataset. One category name per line.
 
-#### Sample
+##### Sample
 
 ```text
 Smileys & People
@@ -140,12 +140,12 @@ Smileys & Emotion
 People & Body
 ```
 
-### `subcategories.txt`
+#### `subcategories.txt`
 
 A list of all category-subcategory pairs in the dataset. One pair per line,
 separated by a tab character.
 
-#### Sample
+##### Sample
 
 ```text
 Symbols keycap
@@ -166,12 +166,12 @@ People & Body   hand-single-finger
 People & Body   hand-fingers-closed
 ```
 
-### `subcategories_count.txt`
+#### `subcategories_count.txt`
 
 Like `subcategories.txt`, but every line has an additional column with the
 total number of emojis residing under that category/subcategory pair.
 
-#### Sample
+##### Sample
 
 ```text
 People & Body person-role 488
@@ -183,7 +183,31 @@ People & Body family  77
 People & Body person-symbol 2
 ```
 
-### `all.json`
+#### `versions_count.txt`
+
+Counts the number of recorded Emojis from each version of the Unicode standard.
+
+You can read up on [what the versions mean][emoji-versions] in the [Unicode
+Technical Standard #51][tr51].
+
+##### Sample
+
+```text
+E0.6  793
+E0.7  254
+E1.0  512
+E2.0  297
+E3.0  157
+E4.0  1030
+E5.0  339
+E11.0 188
+E12.0 266
+E12.1 186
+E13.0 146
+E13.1 422
+```
+
+#### `all.json`
 
 Raw data of all emojis used to generate all other data files. Basic structure is this:
 
@@ -199,6 +223,7 @@ Raw data of all emojis used to generate all other data files. Basic structure is
             {
               "characters": "…",
               "name": "name of the emoji",
+              "version": "E1.0",
               "keywords": {
                 "lang1": ["keyword1", "keyword2"],
                 "lang2": ["keyword1", "keyword2"]
@@ -236,3 +261,5 @@ Code in this repo is Copyright © 2019 Magnus Bergmark.
 
 [cldr]: http://cldr.unicode.org/
 [unicode-license]: http://www.unicode.org/copyright.html
+[tr51]: http://www.unicode.org/reports/tr51/
+[emoji-versions]: http://www.unicode.org/reports/tr51/#EmojiVersions
diff --git a/cldr b/cldr
diff --git a/compile.rb b/compile.rb
@@ -11,7 +11,7 @@
 
 $stderr.print "Loading CLDR emoji-test file…"
 compiler.add_test_file(
-  "cldr/tools/java/org/unicode/cldr/util/data/emoji/emoji-test.txt"
+  "cldr/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/emoji/emoji-test.txt"
 )
 warn " Done!"
 

diff --git a/lib/annotation_file.rb b/lib/annotation_file.rb
@@ -20,10 +20,11 @@ def each_annotation
     return to_enum(:each_annotation) unless block_given?
 
     each_annotation_element do |element|
+      characters = element["cp"]
       if element["type"] == "tts"
-        read_tts(element) { |emoji| yield emoji }
+        read_tts(characters, element) { |emoji| yield emoji }
       else
-        read_keywords(element) { |emoji| yield emoji }
+        read_keywords(characters, element) { |emoji| yield emoji }
       end
     end
   end
@@ -37,17 +38,17 @@ def each_annotation_element
     end
   end
 
-  def read_tts(element)
+  def read_tts(characters, element)
     description = element.text.strip
     if description != CLDR_DUMMY_VALUE
       yield Emoji.new(
-        characters: element["cp"],
+        characters: characters,
         tts_descriptions: {language => description}
       )
     end
   end
 
-  def read_keywords(element)
+  def read_keywords(characters, element)
     keywords = element
       .text
       .split("|")
@@ -56,7 +57,7 @@ def read_keywords(element)
 
     unless keywords.empty?
       yield Emoji.new(
-        characters: element["cp"],
+        characters: characters,
         keywords: {language => keywords}
       )
     end

diff --git a/lib/compiler.rb b/lib/compiler.rb
@@ -18,6 +18,12 @@ def add_emoji(emoji)
     end
   end
 
+  def merge_emoji(emoji)
+    if (current = emojis[emoji.characters])
+      current.merge!(emoji)
+    end
+  end
+
   def add_test_file(filename)
     EmojiTestFile.new(filename).each_emoji do |emoji|
       add_emoji(emoji)
@@ -26,7 +32,7 @@ def add_test_file(filename)
 
   def add_annotation_file(filename)
     AnnotationFile.new(filename).each_annotation do |emoji|
-      add_emoji(emoji)
+      merge_emoji(emoji)
     end
   end
 

diff --git a/lib/emoji.rb b/lib/emoji.rb
@@ -2,10 +2,19 @@
 
 class Emoji
   attr_reader :characters
-  attr_accessor :name, :category, :subcategory, :keywords, :tts_descriptions, :qualification
+  attr_accessor(
+    :name,
+    :version,
+    :category,
+    :subcategory,
+    :keywords,
+    :tts_descriptions,
+    :qualification
+  )
 
   def initialize(
     characters:,
+    version: nil,
     name: nil,
     category: nil,
     subcategory: nil,
@@ -14,6 +23,7 @@ def initialize(
     tts_descriptions: {}
   )
     @characters = characters
+    @version = version
     @name = name
     @category = category
     @subcategory = subcategory
@@ -24,6 +34,7 @@ def initialize(
 
   def merge!(other)
     self.name ||= other.name
+    self.version ||= other.version
     self.category ||= other.category
     self.subcategory ||= other.subcategory
     self.qualification ||= other.qualification

diff --git a/lib/emoji_test_file.rb b/lib/emoji_test_file.rb
@@ -54,19 +54,23 @@ def parse_emoji_line(line, current_group, current_subgroup)
         category: current_group,
         subcategory: current_subgroup,
         characters: matches["characters"],
+        version: matches["version"],
         name: matches["name"],
         qualification: matches["qualification"]
       )
     end
+  rescue => error
+    raise "Error while parsing Emoji test data: #{error}\nLine: #{line.inspect}"
   end
 
   MATCHER = /
-    # "1F48B     ; fully-qualified     # 💋 kiss mark"
+    # "1F48B     ; fully-qualified     # 💋 E0.6 kiss mark"
     ^
-    [^;]+;\s                  # "1F48B       ; "
+    [^;:]+;\s                 # "1F48B       ; "
     (?<qualification>[^\s]+)  # "fully-qualified"
     \s+\#\s+                  # "  # "
     (?<characters>[^\s]+)\s+  # "💋 "
+    (?<version>[^\s]+)\s+     # "E0.6 "
     (?<name>.*)$              # "kiss mark"
   /x.freeze
 end
diff --git a/lib/formatter.rb b/lib/formatter.rb
@@ -41,6 +41,7 @@ def format_emojis(emojis)
     emojis.map do |emoji|
       {
         "characters" => emoji.characters,
+        "version" => emoji.version,
         "name" => emoji.name,
         "keywords" => emoji.keywords,
         "tts_descriptions" => emoji.tts_descriptions,

diff --git a/spec/compiler_spec.rb b/spec/compiler_spec.rb
@@ -12,14 +12,25 @@
     compiler = Compiler.new
 
     with_example_files(
+      # Get this test data with:
+      #   grep \
+      #    -E "(1F600|270D FE0F|group: |subgroup: )" \
+      #    cldr/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/emoji/emoji-test.txt
+      #
+      # Remove the lines not needed for the test
       "test_file.txt" => <<~TEXT,
         # group: Smileys & Emotion
         # subgroup: face-smiling
-        1F600                                      ; fully-qualified     # 😀 grinning face
+        1F600                                                  ; fully-qualified     # 😀 E1.0 grinning face
         # group: People & Body
         # subgroup: hand-prop
-        270D FE0F                                  ; fully-qualified     # ✍️ writing hand
+        270D FE0F                                              ; fully-qualified     # ✍️ E0.7 writing hand
       TEXT
+
+      # Get this test data with:
+      #   grep -E '(😀|`)' cldr/common/annotations/en.xml
+      #
+      # Remove the lines not needed for the test
       "en.xml" => <<~XML
         <?xml version="1.0" encoding="UTF-8" ?>
         <!DOCTYPE ldml SYSTEM "../../common/dtd/ldml.dtd">
@@ -29,7 +40,10 @@
             <language type="en"/>
           </identity>
           <annotations>
+            <annotation cp="`">accent | grave | tone</annotation>
+            <annotation cp="`" type="tts">grave accent</annotation>
             <annotation cp="😀">face | grin | grinning face</annotation>
+            <annotation cp="😀" type="tts">grinning face</annotation>
           </annotations>
         </ldml>
       XML
@@ -40,14 +54,20 @@
       expect(compiler.emojis.size).to eq(2)
       expect(compiler.emojis["😀"]).to have_attributes(
         name: "grinning face",
-        keywords: {}
+        keywords: {},
+        tts_descriptions: {}
       )
 
       compiler.add_annotation_file(paths["en.xml"])
+
+      # The "`" character was not added to the set as it was never present in
+      # the set before loading the annotation files.
       expect(compiler.emojis.size).to eq(2)
+
       expect(compiler.emojis["😀"]).to have_attributes(
         name: "grinning face",
-        keywords: {"en" => ["face", "grin", "grinning face"]}
+        keywords: {"en" => ["face", "grin", "grinning face"]},
+        tts_descriptions: {"en" => "grinning face"}
       )
     end
   end

diff --git a/spec/emoji_spec.rb b/spec/emoji_spec.rb
@@ -10,6 +10,7 @@
 
     expect(emoji).to have_attributes(
       characters: "😀",
+      version: nil,
       name: nil,
       category: nil,
       subcategory: nil,
@@ -22,6 +23,7 @@
   it "accepts optional attributes" do
     emoji = Emoji.new(
       characters: "😀",
+      version: "E0.6",
       name: "grinning face",
       category: "Category",
       subcategory: "Subcategory",
@@ -32,6 +34,7 @@
 
     expect(emoji).to have_attributes(
       characters: "😀",
+      version: "E0.6",
       name: "grinning face",
       category: "Category",
       subcategory: "Subcategory",
@@ -55,20 +58,22 @@
 
   describe "#merge!" do
     it "assigns missing fields from the other" do
-      emoji_a = Emoji.new(characters: "A", category: "Letters")
-      emoji_b = Emoji.new(characters: "B", subcategory: "Latin")
+      emoji_a = Emoji.new(characters: "A", category: "Letters", version: "E0.6")
+      emoji_b = Emoji.new(characters: "B", subcategory: "Latin", version: "E1.0")
 
       emoji_a.merge!(emoji_b)
 
       expect(emoji_a).to have_attributes(
         characters: "A",
+        version: "E0.6",
         category: "Letters",
         subcategory: "Latin",
         keywords: {}
       )
 
       expect(emoji_b).to have_attributes(
         characters: "B",
+        version: "E1.0",
         category: nil,
         subcategory: "Latin",
         keywords: {}