From 1333fc2588c3730bed79d7cf7be8b7610d3640c1 Mon Sep 17 00:00:00 2001
From: "Santiago M. Mola" <santi@mola.io>
Date: Mon, 31 Jul 2017 14:07:28 +0200
Subject: [PATCH] tokenizer: capture non-ASCII identifiers

`\w` captures only ASCII letters, digits, and the underscore. Changed to
[[:alnum:]] (plus an explicit `_`) to capture any Unicode letter and digit.
This makes the tokenizer work properly on non-English-based languages
(e.g. 1C Enterprise).
---
 lib/linguist/tokenizer.rb | 2 +-
 test/test_tokenizer.rb    | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/lib/linguist/tokenizer.rb b/lib/linguist/tokenizer.rb
index 7b61804964..4342a2aaa2 100644
--- a/lib/linguist/tokenizer.rb
+++ b/lib/linguist/tokenizer.rb
@@ -107,7 +107,7 @@ def extract_tokens(data)
           tokens << token
 
         # Regular token
-        elsif token = s.scan(/[\w\.@#\/\*]+/)
+        elsif token = s.scan(/[[[:alnum:]]_\.@#\/\*]+/)
           tokens << token
 
         # Common operators
diff --git a/test/test_tokenizer.rb b/test/test_tokenizer.rb
index b8b486cb0d..44844ca4c2 100644
--- a/test/test_tokenizer.rb
+++ b/test/test_tokenizer.rb
@@ -113,4 +113,8 @@ def test_ruby_tokens
     assert_equal %w(module Foo end), tokenize(:"Ruby/foo.rb")
     assert_equal %w(task default do puts end), tokenize(:"Ruby/filenames/Rakefile")
   end
+
+  def test_utf8_tokens
+    assert_equal %w(Функция الكون), tokenize(:"Функция الكون".to_s)
+  end
 end