diff --git a/lib/linguist/tokenizer.rb b/lib/linguist/tokenizer.rb
index 7b61804964..4342a2aaa2 100644
--- a/lib/linguist/tokenizer.rb
+++ b/lib/linguist/tokenizer.rb
@@ -107,7 +107,7 @@ def extract_tokens(data)
          tokens << token

        # Regular token
-        elsif token = s.scan(/[\w\.@#\/\*]+/)
+        elsif token = s.scan(/[[[:alnum:]]_\.@#\/\*]+/)
          tokens << token

        # Common operators
diff --git a/test/test_tokenizer.rb b/test/test_tokenizer.rb
index b8b486cb0d..44844ca4c2 100644
--- a/test/test_tokenizer.rb
+++ b/test/test_tokenizer.rb
@@ -113,4 +113,8 @@ def test_ruby_tokens
     assert_equal %w(module Foo end), tokenize(:"Ruby/foo.rb")
     assert_equal %w(task default do puts end), tokenize(:"Ruby/filenames/Rakefile")
   end
+
+  def test_utf8_tokens
+    assert_equal %w(Функция الكون), tokenize(:"Функция الكون".to_s)
+  end
 end
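
A minimal standalone check of why the character-class change matters (not part of the patch, just an illustrative snippet for reviewers): Ruby's \w is ASCII-only, whereas the POSIX class [[:alnum:]] is Unicode-aware for UTF-8 strings, so the new pattern also captures the Cyrillic and Arabic identifiers exercised by the added test.

# Illustrative comparison of the old and new "regular token" patterns.
# "Функция" (Cyrillic) and "الكون" (Arabic) are not matched by ASCII-only \w.
ascii_only = /[\w\.@#\/\*]+/            # old pattern: \w == [A-Za-z0-9_]
unicode    = /[[[:alnum:]]_\.@#\/\*]+/  # new pattern: [[:alnum:]] matches Unicode letters/digits

sample = "Функция الكون foo_bar"

p sample.scan(ascii_only) # => ["foo_bar"]
p sample.scan(unicode)    # => ["Функция", "الكون", "foo_bar"]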