From 1333fc2588c3730bed79d7cf7be8b7610d3640c1 Mon Sep 17 00:00:00 2001 From: "Santiago M. Mola" Date: Mon, 31 Jul 2017 14:07:28 +0200 Subject: [PATCH] tokenizer: capture non-ASCII identifiers `\w` captures only ASCII letters and numbers. Changed to [[:alnum:]] to capture any Unicode letter and digit. This makes the tokenizer work properly on non-English-based languages (e.g. 1C Enterprise). --- lib/linguist/tokenizer.rb | 2 +- test/test_tokenizer.rb | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/linguist/tokenizer.rb b/lib/linguist/tokenizer.rb index 7b61804964..4342a2aaa2 100644 --- a/lib/linguist/tokenizer.rb +++ b/lib/linguist/tokenizer.rb @@ -107,7 +107,7 @@ def extract_tokens(data) tokens << token # Regular token - elsif token = s.scan(/[\w\.@#\/\*]+/) + elsif token = s.scan(/[[[:alnum:]]_\.@#\/\*]+/) tokens << token # Common operators diff --git a/test/test_tokenizer.rb b/test/test_tokenizer.rb index b8b486cb0d..44844ca4c2 100644 --- a/test/test_tokenizer.rb +++ b/test/test_tokenizer.rb @@ -113,4 +113,8 @@ def test_ruby_tokens assert_equal %w(module Foo end), tokenize(:"Ruby/foo.rb") assert_equal %w(task default do puts end), tokenize(:"Ruby/filenames/Rakefile") end + + def test_utf8_tokens + assert_equal %w(Функция الكون), tokenize(:"Функция الكون".to_s) + end end