Reconsider Shift-JIS-based normalizing

On Windows Mozc has had several normalization rules to avoid possible compatibility and interoperability problems. With this CL, we basically remove those special rules except for the following two normalization rules: - 0x301C WAVE DASH -> 0xFF5E FULLWIDTH TILDE - 0x2212 MINUS SIGN -> 0xFF0D FULLWIDTH HYPHEN MINUS Here are some examples of behavior changes. Case A: 1. Type "えん" 2. Hit space key to trigger conversion. 3. Choose "¥ [半] 円記号 <機種依存文字>" Current behavior: U+005C is always committed. New behavior: U+00A5 is committed if the chosen candidate is U+00A5 (Compatible with MS-IME on Windows 10) Case B: 1. Type "U+00A5" 2. Hit space key to trigger conversion. 3. Choose "¥ [半] 円記号 <機種依存文字>" Current behavior: U+005C is always committed. New behavior: U+00A5 is committed. Case C: 1. Type "たてぼう" 2. Hit space key to trigger conversion. 3. Choose "‖ [全] 縦線" Current behavior: U+2225 is always committed. (MS-IME on Windows 10 only shows this) New behavior: Both U+2016 and U+2225 can be committed. Case D: 1. Type "うえ" 2. Hit space key to trigger conversion. 3. Choose "‾ [全] オーバライン" Current behavior: U+007E is always committed. New behavior: U+203E is committed. (Compatible with MS-IME on Windows 10 if you choose "‾ [環境依存]") Note that MS-IME on Windows 10 also converts "おーばーらいん" to "‾". Case E: 1. Select "¢" (U+00A2) 2. Hit 変換 key to trigger reconversion. 3. Choose the top candidate. Current behavior: U+FFE0 is always committed. New behavior: U+00A2 is committed. (Compatible with MS-IME behavior) Case F: 1. Select "£" (U+00A3) 2. Hit 変換 key to trigger reconversion. 3. Choose the top candidate. Current behavior: U+FFE1 is always committed. New behavior: U+00A3 is committed. (Compatible with MS-IME behavior) Case G: 1. Select "¬" (U+00AC) 2. Hit 変換 key to trigger reconversion. 3. Choose the top candidate. Current behavior: U+FFE2 is always committed. New behavior: U+00AC is committed. (Compatible with MS-IME behavior) Case H: 1. 
Type "vaiorin" in Romaji mode Current behavior: "ヴぁいおりん" (Compatible with MS-IME behavior on Windows 10) New behavior: "ゔぁいおりん" BUG= TEST= REF_BUG=26674144 REF_CL=113128015,113129243,113232446 REF_TIME=2016-01-27T14:50:45+09:00 REF_TIME_RAW=1453873845 +0900
google · Jan 27, 2016 · 26241b0 · 26241b0
1 parent 1a6e539
commit 26241b0
Show file tree

Hide file tree

Showing 15 changed files with 90 additions and 258 deletions.
diff --git a/src/base/text_normalizer.cc b/src/base/text_normalizer.cc
@@ -36,6 +36,7 @@
 namespace mozc {
 namespace {
 
+#ifdef OS_WIN
 // Unicode vendor-specific character table:
 // http://hp.vector.co.jp/authors/VA010341/unicode/
 // http://www.notoinsatu.co.jp/font/omake/OTF_other.pdf
@@ -45,75 +46,35 @@ namespace {
 // Windows CP932 (shift-jis) maps WAVE_DASH to FULL_WIDTH_TILDA.
 // Since the font of WAVE-DASH is ugly on Windows, here we convert WAVE-DASH to
 // FULL_WIDTH_TILDA as CP932 does.
-#ifdef OS_WIN
-inline char32 ConvertVenderSpecificCharacter(char32 c) {
+//
+// As Unicode has become the de facto default encoding, we have reduced
+// the number of characters to be normalized.
+inline char32 NormalizeCharForWindows(char32 c) {
   switch (c) {
-    case 0x00A5:   // YEN SIGN
-      return 0x005C;   // REVERSE SOLIDUS
-      break;
-    case 0x203E:   // OVERLINE
-      return 0x007E;  // TILDE
-      break;
     case 0x301C:  // WAVE DASH
       return 0xFF5E;   // FULLWIDTH TILDE
       break;
-    case 0x2016:   // DOUBLE VERTICAL LINE
-      return 0x2225;   // PARALLEL TO
-      break;
     case 0x2212:  // MINUS SIGN
       return 0xFF0D;   // FULLWIDTH HYPHEN MINUS
       break;
-    case 0x00A2:   // CENT SIGN
-      return 0xFFE0;    // FULLWIDTH CENT SIGN
-      break;
-    case 0x00A3:   // POUND SIGN
-      return 0xFFE1;   // FULLWIDTH POUND SIGN
-      break;
-    case 0x00AC:   // NOT SIGN
-      return 0xFFE2;   // FULLWIDTH NOT SIGN
-      break;
     default:
       return c;
       break;
   }
 }
+#endif  // OS_WIN
 
-#else   // MAC & Linux
-inline char32 ConvertVenderSpecificCharacter(char32 c) {
-  return c;
-}
-#endif
+}  // namespace
 
-void ConvertVenderSpecificString(StringPiece input, string *output) {
+void TextNormalizer::NormalizeText(StringPiece input, string *output) {
+#ifdef OS_WIN
   output->clear();
   for (ConstChar32Iterator iter(input); !iter.Done(); iter.Next()) {
-    Util::UCS4ToUTF8Append(ConvertVenderSpecificCharacter(iter.Get()), output);
+    Util::UCS4ToUTF8Append(NormalizeCharForWindows(iter.Get()), output);
   }
-}
-
-}  // namespace
-
-void TextNormalizer::NormalizePreeditText(StringPiece input, string *output) {
-  string tmp;
-  // This is a workaround for hiragana v'
-  //  Util::StringReplace(input, "ゔ", "ヴ", true, &tmp);
-  Util::StringReplace(input, "\xE3\x82\x94", "\xE3\x83\xB4", true, &tmp);
-  ConvertVenderSpecificString(tmp, output);
-}
-
-void TextNormalizer::NormalizeTransliterationText(StringPiece input,
-                                                  string *output) {
-  // Do the same thing with NormalizePreeditText at this morment.
-  NormalizePreeditText(input, output);
-}
-
-void TextNormalizer::NormalizeConversionText(StringPiece input,
-                                             string *output) {
-  ConvertVenderSpecificString(input, output);
-}
-
-void TextNormalizer::NormalizeCandidateText(StringPiece input, string *output) {
-  ConvertVenderSpecificString(input, output);
+#else
+  input.CopyToString(output);
+#endif
 }
 
 }  // namespace mozc
diff --git a/src/base/text_normalizer.h b/src/base/text_normalizer.h
@@ -41,10 +41,7 @@ namespace mozc {
 
 class TextNormalizer {
  public:
-  static void NormalizePreeditText(StringPiece input, string *output);
-  static void NormalizeTransliterationText(StringPiece input, string *output);
-  static void NormalizeConversionText(StringPiece input, string *output);
-  static void NormalizeCandidateText(StringPiece input, string *output);
+  static void NormalizeText(StringPiece input, string *output);
 
  private:
   DISALLOW_IMPLICIT_CONSTRUCTORS(TextNormalizer);

diff --git a/src/base/text_normalizer_test.cc b/src/base/text_normalizer_test.cc
@@ -36,84 +36,52 @@
 
 namespace mozc {
 
-TEST(TextNormalizerTest, NormalizePreeditText) {
+TEST(TextNormalizerTest, NormalizeText) {
   string output;
   // "めかぶ"
-  TextNormalizer::NormalizePreeditText(
-      "\xe3\x82\x81\xe3\x81\x8b\xe3\x81\xb6", &output);
+  TextNormalizer::NormalizeText("\xe3\x82\x81\xe3\x81\x8b\xe3\x81\xb6",
+                                &output);
   // "めかぶ"
   EXPECT_EQ("\xe3\x82\x81\xe3\x81\x8b\xe3\x81\xb6", output);
 
   // "ゔぁいおりん"
-  TextNormalizer::NormalizePreeditText("\xe3\x82\x94\xe3\x81\x81\xe3\x81\x84"
-                                          "\xe3\x81\x8a\xe3\x82\x8a\xe3\x82"
-                                          "\x93", &output);
-  // "ヴぁいおりん"
-  EXPECT_EQ("\xe3\x83\xb4\xe3\x81\x81\xe3\x81\x84\xe3\x81\x8a\xe3\x82\x8a\xe3"
-            "\x82\x93", output);
+  TextNormalizer::NormalizeText("\xe3\x82\x94\xe3\x81\x81\xe3\x81\x84"
+                                "\xe3\x81\x8a\xe3\x82\x8a\xe3\x82\x93",
+                                &output);
+  // "ゔぁいおりん"
+  EXPECT_EQ("\xe3\x82\x94\xe3\x81\x81\xe3\x81\x84"
+            "\xe3\x81\x8a\xe3\x82\x8a\xe3\x82\x93",
+            output);
 
   // "ぐ〜ぐる"
-  TextNormalizer::NormalizePreeditText("\xe3\x81\x90\xe3\x80\x9c\xe3\x81\x90"
-                                          "\xe3\x82\x8b", &output);
+  TextNormalizer::NormalizeText("\xe3\x81\x90\xe3\x80\x9c\xe3\x81\x90"
+                                "\xe3\x82\x8b", &output);
 #ifdef OS_WIN
   // "ぐ～ぐる"
   EXPECT_EQ("\xe3\x81\x90\xef\xbd\x9e\xe3\x81\x90\xe3\x82\x8b", output);
 #else
   // "ぐ〜ぐる"
   EXPECT_EQ("\xe3\x81\x90\xe3\x80\x9c\xe3\x81\x90\xe3\x82\x8b", output);
 #endif
-}
 
-TEST(TextNormalizerTest, NormalizeTransliterationText) {
-  string output;
-  // "めかぶ"
-  TextNormalizer::NormalizeTransliterationText(
-      "\xe3\x82\x81\xe3\x81\x8b\xe3\x81\xb6", &output);
-  // "めかぶ"
-  EXPECT_EQ("\xe3\x82\x81\xe3\x81\x8b\xe3\x81\xb6", output);
-
-  // "ゔぁいおりん"
-  TextNormalizer::NormalizeTransliterationText(
-      "\xe3\x82\x94\xe3\x81\x81\xe3\x81\x84"
-      "\xe3\x81\x8a\xe3\x82\x8a\xe3\x82\x93",
+  // "１−２−３": "−" is U+2212
+  TextNormalizer::NormalizeText(
+      "\xEF\xBC\x91\xE2\x88\x92\xEF\xBC\x92\xE2\x88\x92\xEF\xBC\x93",
       &output);
-  // "ヴぁいおりん"
-  EXPECT_EQ("\xe3\x83\xb4\xe3\x81\x81\xe3\x81\x84\xe3\x81\x8a\xe3\x82\x8a\xe3"
-            "\x82\x93", output);
-
-  // "ぐ〜ぐる"
-  TextNormalizer::NormalizeTransliterationText(
-      "\xe3\x81\x90\xe3\x80\x9c\xe3\x81\x90\xe3\x82\x8b", &output);
 #ifdef OS_WIN
-  // "ぐ～ぐる"
-  EXPECT_EQ("\xe3\x81\x90\xef\xbd\x9e\xe3\x81\x90\xe3\x82\x8b", output);
+  // "１－２－３": "－" is U+FF0D
+  EXPECT_EQ("\xEF\xBC\x91\xEF\xBC\x8D\xEF\xBC\x92\xEF\xBC\x8D\xEF\xBC\x93",
+            output);
 #else
-  // "ぐ〜ぐる"
-  EXPECT_EQ("\xe3\x81\x90\xe3\x80\x9c\xe3\x81\x90\xe3\x82\x8b", output);
+  // "１−２−３": "−" is U+2212
+  EXPECT_EQ("\xEF\xBC\x91\xE2\x88\x92\xEF\xBC\x92\xE2\x88\x92\xEF\xBC\x93",
+            output);
 #endif
-}
 
-TEST(TextNormalizerTest, NormalizeCandidateText) {
-#ifdef OS_WIN
-  string output;
-  // "ぐ〜ぐる"
-  TextNormalizer::NormalizeCandidateText("\xe3\x81\x90\xe3\x80\x9c\xe3\x81"
-                                            "\x90\xe3\x82\x8b", &output);
-  // "ぐ～ぐる"
-  EXPECT_EQ("\xe3\x81\x90\xef\xbd\x9e\xe3\x81\x90\xe3\x82\x8b", output);
-  // "−"
-  TextNormalizer::NormalizeCandidateText("\xe2\x88\x92", &output);
-  // "－"
-  EXPECT_EQ("\xef\xbc\x8d", output);
-  // "¢"
-  TextNormalizer::NormalizeCandidateText("\xc2\xa2", &output);
-  // "￠"
-  EXPECT_EQ("\xef\xbf\xa0", output);
-  // "‖"
-  TextNormalizer::NormalizeCandidateText("\xe2\x80\x96", &output);
-  // "∥"
-  EXPECT_EQ("\xe2\x88\xa5", output);
-#endif
+  // "¥298": "¥" is U+00A5
+  TextNormalizer::NormalizeText("\xC2\xA5\x32\x39\x38", &output);
+  // U+00A5 is no longer normalized.
+  EXPECT_EQ("\xC2\xA5\x32\x39\x38", output);
 }
 
 }  // namespace mozc
diff --git a/src/converter/quality_regression_util.cc b/src/converter/quality_regression_util.cc
@@ -132,7 +132,7 @@ bool QualityRegressionUtil::TestItem::ParseFromTSV(const string &line) {
   }
   tokens[0].CopyToString(&label);
   tokens[1].CopyToString(&key);
-  TextNormalizer::NormalizeCandidateText(tokens[2], &expected_value);
+  TextNormalizer::NormalizeText(tokens[2], &expected_value);
   tokens[3].CopyToString(&command);
   expected_rank  = NumberUtil::SimpleAtoi(tokens[4]);
   NumberUtil::SafeStrToDouble(tokens[5], &accuracy);

diff --git a/src/data/symbol/symbol.tsv b/src/data/symbol/symbol.tsv
@@ -38,6 +38,7 @@ POS	CHAR	Reading (space separated)	description	additional description	category	m
 記号	／	すらっしゅ しゃせん ・ / きごう ななめ	スラッシュ		GENERAL
 記号	＼	ばっくすらっしゅ ぎゃくしゃせん ・ / \ きごう しゃせん すらっしゅ ななめ	バックスラッシュ		GENERAL
 記号	〜	にょろ なみ から ー - ~ きごう より	波ダッシュ		GENERAL
+記号	∥	| || たてぼう たてせん きごう	縦線		SYMBOL
 記号	‖	| || たてぼう たてせん きごう	縦線		SYMBOL
 記号	｜	| たてぼう たてせん きごう	縦線		GENERAL
 句読点	…	。。。 .  ... ・・・  ・ てん きごう あまり さんてん さんてんりーだ	三点リーダ		SYMBOL OTHER

diff --git a/src/data/test/session/scenario/twelvekeys_toggle_hiragana_preedit_scenario.txt b/src/data/test/session/scenario/twelvekeys_toggle_hiragana_preedit_scenario.txt
@@ -167,13 +167,13 @@ EXPECT_PREEDIT	う
 SEND_KEYS	*
 EXPECT_PREEDIT	ぅ
 SEND_KEYS	*
-EXPECT_PREEDIT	ヴ
+EXPECT_PREEDIT	ゔ
 SEND_KEYS	*
 EXPECT_PREEDIT	う
 SEND_KEYS	*
 EXPECT_PREEDIT	ぅ
 SEND_KEYS	*
-EXPECT_PREEDIT	ヴ
+EXPECT_PREEDIT	ゔ
 SEND_KEYS	*
 EXPECT_PREEDIT	う
 SEND_KEYS	1
@@ -187,9 +187,9 @@ EXPECT_PREEDIT	ぅあ
 
 RESET_CONTEXT
 SEND_KEYS	111**
-EXPECT_PREEDIT	ヴ
+EXPECT_PREEDIT	ゔ
 SEND_KEYS	1
-EXPECT_PREEDIT	ヴあ
+EXPECT_PREEDIT	ゔあ
 
 RESET_CONTEXT
 SEND_KEYS	1111

diff --git a/src/mozc_version_template.txt b/src/mozc_version_template.txt
@@ -1,6 +1,6 @@
 MAJOR=2
 MINOR=17
-BUILD=2422
+BUILD=2423
 REVISION=102
 # NACL_DICTIONARY_VERSION is the target version of the system dictionary to be
 # downloaded by NaCl Mozc.

diff --git a/src/rewriter/normalization_rewriter.cc b/src/rewriter/normalization_rewriter.cc
@@ -55,15 +55,10 @@ bool NormalizeCandidate(Segment::Candidate *candidate,
 
   string value, content_value;
   switch (type) {
-    case CANDIDATE:
-      TextNormalizer::NormalizeCandidateText(candidate->value, &value);
-      TextNormalizer::NormalizeCandidateText(candidate->content_value,
-                                             &content_value);
-      break;
+    case CANDIDATE:  // Fall through to TRANSLITERATION
     case TRANSLITERATION:
-      TextNormalizer::NormalizeTransliterationText(candidate->value, &value);
-      TextNormalizer::NormalizeTransliterationText(candidate->content_value,
-                                                   &content_value);
+      TextNormalizer::NormalizeText(candidate->value, &value);
+      TextNormalizer::NormalizeText(candidate->content_value, &content_value);
       break;
     default:
       LOG(ERROR) << "unkown type";

diff --git a/src/rewriter/transliteration_rewriter.cc b/src/rewriter/transliteration_rewriter.cc
@@ -86,8 +86,7 @@ void NormalizeT13ns(vector<string> *t13ns) {
   string normalized;
   for (size_t i = 0; i < t13ns->size(); ++i) {
     normalized.clear();
-    TextNormalizer::NormalizeTransliterationText(
-        t13ns->at(i), &normalized);
+    TextNormalizer::NormalizeText(t13ns->at(i), &normalized);
     t13ns->at(i) = normalized;
   }
 }

diff --git a/src/rewriter/transliteration_rewriter_test.cc b/src/rewriter/transliteration_rewriter_test.cc
@@ -598,35 +598,6 @@ TEST_F(TransliterationRewriterTest, NoRewriteTest) {
   EXPECT_EQ(0, segments.conversion_segment(0).meta_candidates_size());
 }
 
-TEST_F(TransliterationRewriterTest, NormalizedTransliterations) {
-  std::unique_ptr<TransliterationRewriter> t13n_rewriter(
-      CreateTransliterationRewriter());
-
-  composer::Table table;
-  table.InitializeWithRequestAndConfig(default_request(), default_config());
-  composer::Composer composer(&table, &default_request(), &default_config());
-
-  // "らゔ"
-  composer.InsertCharacterPreedit("\xE3\x82\x89\xE3\x82\x94");
-
-  Segments segments;
-  {  // Initialize segments.
-    Segment *segment = segments.add_segment();
-    CHECK(segment);
-    // "らゔ"
-    segment->set_key("\xE3\x82\x89\xE3\x82\x94");
-    segment->add_candidate()->value = "LOVE";
-  }
-
-  ConversionRequest request(&composer, &default_request(), &default_config());
-  EXPECT_TRUE(t13n_rewriter->Rewrite(request, &segments));
-  EXPECT_EQ(1, segments.segments_size());
-  const Segment &seg = segments.segment(0);
-  // "らヴ"
-  EXPECT_EQ("\xE3\x82\x89\xE3\x83\xB4",
-            seg.meta_candidate(transliteration::HIRAGANA).value);
-}
-
 TEST_F(TransliterationRewriterTest, MobileEnvironmentTest) {
   ConversionRequest convreq;
   commands::Request request;

diff --git a/src/session/internal/session_output.cc b/src/session/internal/session_output.cc
@@ -380,11 +380,11 @@ bool SessionOutput::AddSegment(const string &key,
                                commands::Preedit *preedit) {
   // Key is always normalized as a preedit text.
   string normalized_key;
-  TextNormalizer::NormalizePreeditText(key, &normalized_key);
+  TextNormalizer::NormalizeText(key, &normalized_key);
 
   string normalized_value;
   if (segment_type_mask & PREEDIT) {
-    TextNormalizer::NormalizePreeditText(value, &normalized_value);
+    TextNormalizer::NormalizeText(value, &normalized_value);
   } else if (segment_type_mask & CONVERSION) {
     normalized_value = value;
   } else {
@@ -462,7 +462,7 @@ void SessionOutput::FillConversionResult(const string &key,
                                          commands::Result *result_proto) {
   // Key should be normalized as a preedit text.
   string normalized_key;
-  TextNormalizer::NormalizePreeditText(key, &normalized_key);
+  TextNormalizer::NormalizeText(key, &normalized_key);
 
   // value is already normalized by converter.
   FillConversionResultWithoutNormalization(
@@ -473,7 +473,7 @@ void SessionOutput::FillConversionResult(const string &key,
 void SessionOutput::FillPreeditResult(const string &preedit,
                                       commands::Result *result_proto) {
   string normalized_preedit;
-  TextNormalizer::NormalizePreeditText(preedit, &normalized_preedit);
+  TextNormalizer::NormalizeText(preedit, &normalized_preedit);
 
   FillConversionResultWithoutNormalization(
       normalized_preedit, normalized_preedit, result_proto);