Skip to content

Commit

Permalink
Reconsider Shift-JIS-based normalizing
Browse files Browse the repository at this point in the history
On Windows, Mozc has had several normalization rules to avoid possible
compatibility and interoperability issues.

With this CL, we basically remove those special rules except for the
following two normalization rules:
 - 0x301C WAVE DASH  -> 0xFF5E FULLWIDTH TILDE
 - 0x2212 MINUS SIGN -> 0xFF0D FULLWIDTH HYPHEN MINUS

Here are some examples of behavior changes.

 Case A:
  1. Type "えん"
  2. Hit space key to trigger conversion.
  3. Choose "¥ [半] 円記号 <機種依存文字>"

  Current behavior: U+005C is always committed.
      New behavior: U+00A5 is committed if the chosen candidate is U+00A5
                    (Compatible with MS-IME on Windows 10)

 Case B:
  1. Type "U+00A5"
  2. Hit space key to trigger conversion.
  3. Choose "¥ [半] 円記号 <機種依存文字>"

  Current behavior: U+005C is always committed.
      New behavior: U+00A5 is committed.

 Case C:
  1. Type "たてぼう"
  2. Hit space key to trigger conversion.
  3. Choose "‖ [全] 縦線"

  Current behavior: U+2225 is always committed.
                    (MS-IME on Windows 10 only shows this)
      New behavior: Both U+2016 and U+2225 can be committed.

 Case D:
  1. Type "うえ"
  2. Hit space key to trigger conversion.
  3. Choose "‾ [全] オーバライン"

  Current behavior: U+007E is always committed.
      New behavior: U+203E is committed.
                    (Compatible with MS-IME on Windows 10
                     if you choose "‾ [環境依存]")

  Note that MS-IME on Windows 10 also converts "おーばーらいん" to "‾".

 Case E:
  1. Select "¢" (U+00A2)
  2. Hit 変換 key to trigger reconversion.
  3. Choose the top candidate.

  Current behavior: U+FFE0 is always committed.
      New behavior: U+00A2 is committed.
                    (Compatible with MS-IME behavior)

 Case F:
  1. Select "£" (U+00A3)
  2. Hit 変換 key to trigger reconversion.
  3. Choose the top candidate.

  Current behavior: U+FFE1 is always committed.
      New behavior: U+00A3 is committed.
                    (Compatible with MS-IME behavior)

 Case G:
  1. Select "¬" (U+00AC)
  2. Hit 変換 key to trigger reconversion.
  3. Choose the top candidate.

  Current behavior: U+FFE2 is always committed.
      New behavior: U+00AC is committed.
                    (Compatible with MS-IME behavior)

 Case H:
  1. Type "vaiorin" in Romaji mode

  Current behavior: "ヴぁいおりん"
                    (Compatible with MS-IME behavior on Windows 10)
      New behavior: "ゔぁいおりん"

BUG=
TEST=
REF_BUG=26674144
REF_CL=113128015,113129243,113232446
REF_TIME=2016-01-27T14:50:45+09:00
REF_TIME_RAW=1453873845 +0900
  • Loading branch information
hiroyuki-komatsu committed Jan 27, 2016
1 parent 1a6e539 commit 26241b0
Show file tree
Hide file tree
Showing 15 changed files with 90 additions and 258 deletions.
65 changes: 13 additions & 52 deletions src/base/text_normalizer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
namespace mozc {
namespace {

#ifdef OS_WIN
// Unicode vendor specific character table:
// http://hp.vector.co.jp/authors/VA010341/unicode/
// http://www.notoinsatu.co.jp/font/omake/OTF_other.pdf
Expand All @@ -45,75 +46,35 @@ namespace {
// Windows CP932 (shift-jis) maps WAVE DASH to FULLWIDTH TILDE.
// Since the font of WAVE DASH is ugly on Windows, here we convert WAVE DASH to
// FULLWIDTH TILDE as CP932 does.
#ifdef OS_WIN
inline char32 ConvertVenderSpecificCharacter(char32 c) {
//
// As Unicode has become the de facto default encoding, we have reduced
// the number of characters to be normalized.
inline char32 NormalizeCharForWindows(char32 c) {
switch (c) {
case 0x00A5: // YEN SIGN
return 0x005C; // REVERSE SOLIDUS
break;
case 0x203E: // OVERLINE
return 0x007E; // TILDE
break;
case 0x301C: // WAVE DASH
return 0xFF5E; // FULLWIDTH TILDE
break;
case 0x2016: // DOUBLE VERTICAL LINE
return 0x2225; // PARALLEL TO
break;
case 0x2212: // MINUS SIGN
return 0xFF0D; // FULLWIDTH HYPHEN MINUS
break;
case 0x00A2: // CENT SIGN
return 0xFFE0; // FULLWIDTH CENT SIGN
break;
case 0x00A3: // POUND SIGN
return 0xFFE1; // FULLWIDTH POUND SIGN
break;
case 0x00AC: // NOT SIGN
return 0xFFE2; // FULLWIDTH NOT SIGN
break;
default:
return c;
break;
}
}
#endif // OS_WIN

#else // MAC & Linux
inline char32 ConvertVenderSpecificCharacter(char32 c) {
return c;
}
#endif
} // namespace

void ConvertVenderSpecificString(StringPiece input, string *output) {
void TextNormalizer::NormalizeText(StringPiece input, string *output) {
#ifdef OS_WIN
output->clear();
for (ConstChar32Iterator iter(input); !iter.Done(); iter.Next()) {
Util::UCS4ToUTF8Append(ConvertVenderSpecificCharacter(iter.Get()), output);
Util::UCS4ToUTF8Append(NormalizeCharForWindows(iter.Get()), output);
}
}

} // namespace

void TextNormalizer::NormalizePreeditText(StringPiece input, string *output) {
string tmp;
// This is a workaround for hiragana v'
// Util::StringReplace(input, "ゔ", "ヴ", true, &tmp);
Util::StringReplace(input, "\xE3\x82\x94", "\xE3\x83\xB4", true, &tmp);
ConvertVenderSpecificString(tmp, output);
}

void TextNormalizer::NormalizeTransliterationText(StringPiece input,
string *output) {
// Do the same thing with NormalizePreeditText at this morment.
NormalizePreeditText(input, output);
}

void TextNormalizer::NormalizeConversionText(StringPiece input,
string *output) {
ConvertVenderSpecificString(input, output);
}

void TextNormalizer::NormalizeCandidateText(StringPiece input, string *output) {
ConvertVenderSpecificString(input, output);
#else
input.CopyToString(output);
#endif
}

} // namespace mozc
5 changes: 1 addition & 4 deletions src/base/text_normalizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,7 @@ namespace mozc {

class TextNormalizer {
public:
static void NormalizePreeditText(StringPiece input, string *output);
static void NormalizeTransliterationText(StringPiece input, string *output);
static void NormalizeConversionText(StringPiece input, string *output);
static void NormalizeCandidateText(StringPiece input, string *output);
static void NormalizeText(StringPiece input, string *output);

private:
DISALLOW_IMPLICIT_CONSTRUCTORS(TextNormalizer);
Expand Down
82 changes: 25 additions & 57 deletions src/base/text_normalizer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,84 +36,52 @@

namespace mozc {

TEST(TextNormalizerTest, NormalizePreeditText) {
TEST(TextNormalizerTest, NormalizeText) {
string output;
// "めかぶ"
TextNormalizer::NormalizePreeditText(
"\xe3\x82\x81\xe3\x81\x8b\xe3\x81\xb6", &output);
TextNormalizer::NormalizeText("\xe3\x82\x81\xe3\x81\x8b\xe3\x81\xb6",
&output);
// "めかぶ"
EXPECT_EQ("\xe3\x82\x81\xe3\x81\x8b\xe3\x81\xb6", output);

// "ゔぁいおりん"
TextNormalizer::NormalizePreeditText("\xe3\x82\x94\xe3\x81\x81\xe3\x81\x84"
"\xe3\x81\x8a\xe3\x82\x8a\xe3\x82"
"\x93", &output);
// "ヴぁいおりん"
EXPECT_EQ("\xe3\x83\xb4\xe3\x81\x81\xe3\x81\x84\xe3\x81\x8a\xe3\x82\x8a\xe3"
"\x82\x93", output);
TextNormalizer::NormalizeText("\xe3\x82\x94\xe3\x81\x81\xe3\x81\x84"
"\xe3\x81\x8a\xe3\x82\x8a\xe3\x82\x93",
&output);
// "ゔぁいおりん"
EXPECT_EQ("\xe3\x82\x94\xe3\x81\x81\xe3\x81\x84"
"\xe3\x81\x8a\xe3\x82\x8a\xe3\x82\x93",
output);

// "ぐ〜ぐる"
TextNormalizer::NormalizePreeditText("\xe3\x81\x90\xe3\x80\x9c\xe3\x81\x90"
"\xe3\x82\x8b", &output);
TextNormalizer::NormalizeText("\xe3\x81\x90\xe3\x80\x9c\xe3\x81\x90"
"\xe3\x82\x8b", &output);
#ifdef OS_WIN
// "ぐ~ぐる"
EXPECT_EQ("\xe3\x81\x90\xef\xbd\x9e\xe3\x81\x90\xe3\x82\x8b", output);
#else
// "ぐ〜ぐる"
EXPECT_EQ("\xe3\x81\x90\xe3\x80\x9c\xe3\x81\x90\xe3\x82\x8b", output);
#endif
}

TEST(TextNormalizerTest, NormalizeTransliterationText) {
string output;
// "めかぶ"
TextNormalizer::NormalizeTransliterationText(
"\xe3\x82\x81\xe3\x81\x8b\xe3\x81\xb6", &output);
// "めかぶ"
EXPECT_EQ("\xe3\x82\x81\xe3\x81\x8b\xe3\x81\xb6", output);

// "ゔぁいおりん"
TextNormalizer::NormalizeTransliterationText(
"\xe3\x82\x94\xe3\x81\x81\xe3\x81\x84"
"\xe3\x81\x8a\xe3\x82\x8a\xe3\x82\x93",
// "1−2−3": "−" is U+2212
TextNormalizer::NormalizeText(
"\xEF\xBC\x91\xE2\x88\x92\xEF\xBC\x92\xE2\x88\x92\xEF\xBC\x93",
&output);
// "ヴぁいおりん"
EXPECT_EQ("\xe3\x83\xb4\xe3\x81\x81\xe3\x81\x84\xe3\x81\x8a\xe3\x82\x8a\xe3"
"\x82\x93", output);

// "ぐ〜ぐる"
TextNormalizer::NormalizeTransliterationText(
"\xe3\x81\x90\xe3\x80\x9c\xe3\x81\x90\xe3\x82\x8b", &output);
#ifdef OS_WIN
// "ぐ~ぐる"
EXPECT_EQ("\xe3\x81\x90\xef\xbd\x9e\xe3\x81\x90\xe3\x82\x8b", output);
// "1-2-3": "-" is U+FF0D
EXPECT_EQ("\xEF\xBC\x91\xEF\xBC\x8D\xEF\xBC\x92\xEF\xBC\x8D\xEF\xBC\x93",
output);
#else
// "ぐ〜ぐる"
EXPECT_EQ("\xe3\x81\x90\xe3\x80\x9c\xe3\x81\x90\xe3\x82\x8b", output);
// "1−2−3": "−" is U+2212
EXPECT_EQ("\xEF\xBC\x91\xE2\x88\x92\xEF\xBC\x92\xE2\x88\x92\xEF\xBC\x93",
output);
#endif
}

TEST(TextNormalizerTest, NormalizeCandidateText) {
#ifdef OS_WIN
string output;
// "ぐ〜ぐる"
TextNormalizer::NormalizeCandidateText("\xe3\x81\x90\xe3\x80\x9c\xe3\x81"
"\x90\xe3\x82\x8b", &output);
// "ぐ~ぐる"
EXPECT_EQ("\xe3\x81\x90\xef\xbd\x9e\xe3\x81\x90\xe3\x82\x8b", output);
// "−"
TextNormalizer::NormalizeCandidateText("\xe2\x88\x92", &output);
// "-"
EXPECT_EQ("\xef\xbc\x8d", output);
// "¢"
TextNormalizer::NormalizeCandidateText("\xc2\xa2", &output);
// "¢"
EXPECT_EQ("\xef\xbf\xa0", output);
// "‖"
TextNormalizer::NormalizeCandidateText("\xe2\x80\x96", &output);
// "∥"
EXPECT_EQ("\xe2\x88\xa5", output);
#endif
// "¥298": "¥" is U+00A5
TextNormalizer::NormalizeText("\xC2\xA5\x32\x39\x38", &output);
// U+00A5 is no longer normalized.
EXPECT_EQ("\xC2\xA5\x32\x39\x38", output);
}

} // namespace mozc
2 changes: 1 addition & 1 deletion src/converter/quality_regression_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ bool QualityRegressionUtil::TestItem::ParseFromTSV(const string &line) {
}
tokens[0].CopyToString(&label);
tokens[1].CopyToString(&key);
TextNormalizer::NormalizeCandidateText(tokens[2], &expected_value);
TextNormalizer::NormalizeText(tokens[2], &expected_value);
tokens[3].CopyToString(&command);
expected_rank = NumberUtil::SimpleAtoi(tokens[4]);
NumberUtil::SafeStrToDouble(tokens[5], &accuracy);
Expand Down
1 change: 1 addition & 0 deletions src/data/symbol/symbol.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ POS CHAR Reading (space separated) description additional description category m
記号 すらっしゅ しゃせん ・ / きごう ななめ スラッシュ GENERAL
記号 ばっくすらっしゅ ぎゃくしゃせん ・ / \ きごう しゃせん すらっしゅ ななめ バックスラッシュ GENERAL
記号 にょろ なみ から ー - ~ きごう より 波ダッシュ GENERAL
記号 | || たてぼう たてせん きごう 縦線 SYMBOL
記号 | || たてぼう たてせん きごう 縦線 SYMBOL
記号 | たてぼう たてせん きごう 縦線 GENERAL
句読点 。。。 . ... ・・・ ・ てん きごう あまり さんてん さんてんりーだ 三点リーダ SYMBOL OTHER
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -167,13 +167,13 @@ EXPECT_PREEDIT う
SEND_KEYS *
EXPECT_PREEDIT ぅ
SEND_KEYS *
EXPECT_PREEDIT
EXPECT_PREEDIT
SEND_KEYS *
EXPECT_PREEDIT う
SEND_KEYS *
EXPECT_PREEDIT ぅ
SEND_KEYS *
EXPECT_PREEDIT
EXPECT_PREEDIT
SEND_KEYS *
EXPECT_PREEDIT う
SEND_KEYS 1
Expand All @@ -187,9 +187,9 @@ EXPECT_PREEDIT ぅあ

RESET_CONTEXT
SEND_KEYS 111**
EXPECT_PREEDIT
EXPECT_PREEDIT
SEND_KEYS 1
EXPECT_PREEDIT ヴあ
EXPECT_PREEDIT ゔあ

RESET_CONTEXT
SEND_KEYS 1111
Expand Down
2 changes: 1 addition & 1 deletion src/mozc_version_template.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MAJOR=2
MINOR=17
BUILD=2422
BUILD=2423
REVISION=102
# NACL_DICTIONARY_VERSION is the target version of the system dictionary to be
# downloaded by NaCl Mozc.
Expand Down
11 changes: 3 additions & 8 deletions src/rewriter/normalization_rewriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,10 @@ bool NormalizeCandidate(Segment::Candidate *candidate,

string value, content_value;
switch (type) {
case CANDIDATE:
TextNormalizer::NormalizeCandidateText(candidate->value, &value);
TextNormalizer::NormalizeCandidateText(candidate->content_value,
&content_value);
break;
case CANDIDATE: // Go through to TRANSLITERATION
case TRANSLITERATION:
TextNormalizer::NormalizeTransliterationText(candidate->value, &value);
TextNormalizer::NormalizeTransliterationText(candidate->content_value,
&content_value);
TextNormalizer::NormalizeText(candidate->value, &value);
TextNormalizer::NormalizeText(candidate->content_value, &content_value);
break;
default:
LOG(ERROR) << "unkown type";
Expand Down
3 changes: 1 addition & 2 deletions src/rewriter/transliteration_rewriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,7 @@ void NormalizeT13ns(vector<string> *t13ns) {
string normalized;
for (size_t i = 0; i < t13ns->size(); ++i) {
normalized.clear();
TextNormalizer::NormalizeTransliterationText(
t13ns->at(i), &normalized);
TextNormalizer::NormalizeText(t13ns->at(i), &normalized);
t13ns->at(i) = normalized;
}
}
Expand Down
29 changes: 0 additions & 29 deletions src/rewriter/transliteration_rewriter_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -598,35 +598,6 @@ TEST_F(TransliterationRewriterTest, NoRewriteTest) {
EXPECT_EQ(0, segments.conversion_segment(0).meta_candidates_size());
}

TEST_F(TransliterationRewriterTest, NormalizedTransliterations) {
std::unique_ptr<TransliterationRewriter> t13n_rewriter(
CreateTransliterationRewriter());

composer::Table table;
table.InitializeWithRequestAndConfig(default_request(), default_config());
composer::Composer composer(&table, &default_request(), &default_config());

// "らゔ"
composer.InsertCharacterPreedit("\xE3\x82\x89\xE3\x82\x94");

Segments segments;
{ // Initialize segments.
Segment *segment = segments.add_segment();
CHECK(segment);
// "らゔ"
segment->set_key("\xE3\x82\x89\xE3\x82\x94");
segment->add_candidate()->value = "LOVE";
}

ConversionRequest request(&composer, &default_request(), &default_config());
EXPECT_TRUE(t13n_rewriter->Rewrite(request, &segments));
EXPECT_EQ(1, segments.segments_size());
const Segment &seg = segments.segment(0);
// "らヴ"
EXPECT_EQ("\xE3\x82\x89\xE3\x83\xB4",
seg.meta_candidate(transliteration::HIRAGANA).value);
}

TEST_F(TransliterationRewriterTest, MobileEnvironmentTest) {
ConversionRequest convreq;
commands::Request request;
Expand Down
8 changes: 4 additions & 4 deletions src/session/internal/session_output.cc
Original file line number Diff line number Diff line change
Expand Up @@ -380,11 +380,11 @@ bool SessionOutput::AddSegment(const string &key,
commands::Preedit *preedit) {
// Key is always normalized as a preedit text.
string normalized_key;
TextNormalizer::NormalizePreeditText(key, &normalized_key);
TextNormalizer::NormalizeText(key, &normalized_key);

string normalized_value;
if (segment_type_mask & PREEDIT) {
TextNormalizer::NormalizePreeditText(value, &normalized_value);
TextNormalizer::NormalizeText(value, &normalized_value);
} else if (segment_type_mask & CONVERSION) {
normalized_value = value;
} else {
Expand Down Expand Up @@ -462,7 +462,7 @@ void SessionOutput::FillConversionResult(const string &key,
commands::Result *result_proto) {
// Key should be normalized as a preedit text.
string normalized_key;
TextNormalizer::NormalizePreeditText(key, &normalized_key);
TextNormalizer::NormalizeText(key, &normalized_key);

// value is already normalized by converter.
FillConversionResultWithoutNormalization(
Expand All @@ -473,7 +473,7 @@ void SessionOutput::FillConversionResult(const string &key,
void SessionOutput::FillPreeditResult(const string &preedit,
commands::Result *result_proto) {
string normalized_preedit;
TextNormalizer::NormalizePreeditText(preedit, &normalized_preedit);
TextNormalizer::NormalizeText(preedit, &normalized_preedit);

FillConversionResultWithoutNormalization(
normalized_preedit, normalized_preedit, result_proto);
Expand Down
Loading

0 comments on commit 26241b0

Please sign in to comment.