undertheseanlp · rain1024 · Jun 8, 2024 · Jun 8, 2024 · Jun 8, 2024 · Jun 8, 2024
diff --git a/README.md b/README.md
@@ -292,6 +292,31 @@ Satisfaction, guaranteed.
     ```
 </details>
 
+<details>
+<summary><b><a href="">Lang Detect</a></b> - Identifying the Language of Text
+<code>⚛️</code>
+</summary>
+
+<br/>
+
+Lang Detect API. Thanks to the awesome work from [FastText](https://fasttext.cc/docs/en/language-identification.html).
+
+Install extended dependencies and models
+
+    ```bash
+    $ pip install underthesea[lang-detect]
+    ```
+
+Usage examples in script
+
+    ```python
+    >>> from underthesea.pipeline.lang_detect import lang_detect
+
+    >>> lang_detect("Cựu binh Mỹ trả nhật ký nhẹ lòng khi thấy cuộc sống hòa bình tại Việt Nam")
+    'vi'
+    ```
+</details>
+
 <details>
 <summary><b><a href="">Say 🗣️</a></b> - Converting written text into spoken audio
 <code>⚛️</code>
@@ -363,7 +388,7 @@ Resource CP_Vietnamese_VLC_v2_2022 is downloaded in ~/.underthesea/datasets/CP_V
 
 * Automatic Speech Recognition
 * Machine Translation
-* Chatbot (Chat & Speak)
+* Chatbot Agent
 
 ## Contributing
 

diff --git a/setup.py b/setup.py
@@ -42,6 +42,9 @@
     ],
     'prompt': [
         'openai'
+    ],
+    'lang-detect': [
+        'fasttext   '
     ]
 }
 setup(

diff --git a/tests/pipeline/lang_detect/__init__.py b/tests/pipeline/lang_detect/__init__.py
diff --git a/tests/pipeline/lang_detect/test_lang_detect.py b/tests/pipeline/lang_detect/test_lang_detect.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+from unittest import TestCase
+from underthesea import lang_detect
+
+
+class TestLangDetect(TestCase):
+    """Checks the language code returned by ``lang_detect`` for sample sentences."""
+
+    def test_lang_detect_1(self):
+        # Sentence expected to be identified as Vietnamese.
+        detected = lang_detect("Bộ Công Thương xóa một tổng cục, giảm nhiều đầu mối")
+        self.assertEqual(detected, "vi")
+
+    def test_lang_detect_2(self):
+        # Sentence expected to be identified as French.
+        detected = lang_detect("Ceci est un texte français.")
+        self.assertEqual(detected, "fr")
+
+    def test_lang_detect_3(self):
+        # Sentence expected to be identified as Japanese.
+        detected = lang_detect("如來の妙色身、 世間與に等しきは無し。")
+        self.assertEqual(detected, "ja")
diff --git a/underthesea/__init__.py b/underthesea/__init__.py
@@ -49,6 +49,11 @@
 except Exception:
     pass
 
+try:
+    from underthesea.pipeline.lang_detect import lang_detect
+except Exception as e:
+    # The lang-detect dependencies are an optional extra. Stay silent at import
+    # time (no print on every `import underthesea`) and still define the name,
+    # because 'lang_detect' is listed in __all__ below.
+    _lang_detect_import_error = e
+
+    def lang_detect(*args, **kwargs):
+        """Fallback used when the real ``lang_detect`` could not be imported.
+
+        Raises:
+            ImportError: always, chained to the original import failure.
+        """
+        raise ImportError(
+            "lang_detect is unavailable; install the optional dependencies "
+            "with: pip install underthesea[lang-detect]"
+        ) from _lang_detect_import_error
+
 
 # lazy loading
 def dependency_parse(*args, **kwargs):
@@ -61,6 +66,7 @@ def dependency_parse(*args, **kwargs):
     'text_normalize',
     'word_tokenize', 'pos_tag', 'chunk',
     'ner',
+    'lang_detect',
     'classify', 'sentiment',
     'dependency_parse'
 ]
diff --git a/underthesea/model_fetcher.py b/underthesea/model_fetcher.py
@@ -23,6 +23,7 @@ class UTSModel(Enum):
     sa_general = "SA_GENERAL"
     sa_bank = "SA_BANK"
     sa_bank_v131 = "SA_BANK_V131"
+    lang_detect_fast_text = "LANG_DETECT_FAST_TEXT"
 
 # flake8: noqa: C901
 
@@ -161,6 +162,10 @@ def download(model_name):
         if model_name == "VIET_TTS_V0_4_1":
             ModelFetcher.download_zip(REPO[model_name])
 
+        if model_name == "LANG_DETECT_FAST_TEXT":
+            url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
+            cached_path(url, cache_dir=cache_dir)
+
     @staticmethod
     def list(all=False):
         models = []
@@ -206,6 +211,13 @@ def get_model_path(model):
 
         if model == UTSModel.sa_bank:
             return Path(UNDERTHESEA_FOLDER) / "models" / "SA_BANK"
+
+        if model == UTSModel.sa_bank:
+            return Path(UNDERTHESEA_FOLDER) / "models" / "SA_BANK"
+
+        if model == "LANG_DETECT_FAST_TEXT":
+            return Path(UNDERTHESEA_FOLDER) / "models" / "lid.176.bin"
+
         return Path(UNDERTHESEA_FOLDER) / "models" / model
 
 

diff --git a/underthesea/models.yaml b/underthesea/models.yaml
@@ -48,4 +48,12 @@ VIET_TTS_V0_4_1:
   year: 2023
   url: https://github.com/undertheseanlp/underthesea/releases/download/resources/viet_tts_v0.4.1.zip
   filename: viet_tts_v0.4.1.zip
+LANG_DETECT_FAST_TEXT:
+  cache_dir: models
+  model_path: LANG_DETECT_FAST_TEXT
+  type: Lang Detect
+  license: Open
+  year: 2020
+  url: https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
+  filename: lid.176.bin
 
diff --git a/underthesea/pipeline/lang_detect/__init__.py b/underthesea/pipeline/lang_detect/__init__.py
@@ -0,0 +1,23 @@
+import fasttext
+import os
+from underthesea.model_fetcher import ModelFetcher
+
+# Replace fastText's `eprint` helper with a no-op. Presumably this silences the
+# notice fastText prints when a model is loaded -- TODO confirm against fasttext.
+fasttext.FastText.eprint = lambda x: None
+# Module-level cache for the loaded model; it stays None until lang_detect()
+# loads the model successfully.
+lang_detect_model = None
+
+
+def lang_detect(text):
+    """Return the language label fastText predicts for ``text``.
+
+    On first use the model file is fetched through ``ModelFetcher`` if it is
+    not on disk, then loaded and cached in the module-level
+    ``lang_detect_model`` so later calls reuse it.
+
+    Args:
+        text (str): Text to identify. Newline characters are replaced by
+            spaces before prediction.
+
+    Returns:
+        str: The top predicted label with its ``__label__`` prefix removed,
+        for example ``"vi"``.
+
+    Raises:
+        Exception: Any error raised while downloading or loading the model is
+            propagated to the caller.
+    """
+    global lang_detect_model
+    if lang_detect_model is None:
+        model_name = "LANG_DETECT_FAST_TEXT"
+        model_path = ModelFetcher.get_model_path(model_name)
+        if not os.path.exists(model_path):
+            ModelFetcher.download(model_name)
+        # Let load errors propagate: swallowing them left the cache at None and
+        # the call then failed below with an unrelated AttributeError.
+        lang_detect_model = fasttext.load_model(str(model_path))
+
+    # fastText predicts one line at a time and rejects text containing a newline.
+    single_line = text.replace("\n", " ")
+    labels = lang_detect_model.predict(single_line)[0]
+    return labels[0].replace("__label__", "")