Skip to content

Commit

Permalink
GH-233: Add lang_detect module (#733)
Browse files Browse the repository at this point in the history
  • Loading branch information
rain1024 committed Jun 8, 2024
1 parent dd19616 commit 0793386
Show file tree
Hide file tree
Showing 8 changed files with 98 additions and 1 deletion.
27 changes: 26 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,31 @@ Satisfaction, guaranteed.
```
</details>

<details>
<summary><b><a href="">Lang Detect</a></b> - Identifying the Language of Text
<code>⚛️</code>
</summary>

<br/>

Lang Detect API. Thanks to the awesome work from [FastText](https://fasttext.cc/docs/en/language-identification.html)

Install the extra dependencies and models

```bash
$ pip install underthesea[lang-detect]
```

Usage examples in script

```python
>>> from underthesea.pipeline.lang_detect import lang_detect

>>> lang_detect("Cựu binh Mỹ trả nhật ký nhẹ lòng khi thấy cuộc sống hòa bình tại Việt Nam")
'vi'
```
</details>

<details>
<summary><b><a href="">Say 🗣️</a></b> - Converting written text into spoken audio
<code>⚛️</code>
Expand Down Expand Up @@ -363,7 +388,7 @@ Resource CP_Vietnamese_VLC_v2_2022 is downloaded in ~/.underthesea/datasets/CP_V

* Automatic Speech Recognition
* Machine Translation
* Chatbot (Chat & Speak)
* Chatbot Agent

## Contributing

Expand Down
3 changes: 3 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@
],
'prompt': [
'openai'
],
'lang-detect': [
'fasttext '
]
}
setup(
Expand Down
Empty file.
20 changes: 20 additions & 0 deletions tests/pipeline/lang_detect/test_lang_detect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-
from unittest import TestCase
from underthesea import lang_detect


class TestLangDetect(TestCase):
    """Check that ``lang_detect`` returns the expected language code."""

    def test_lang_detect_1(self):
        """A Vietnamese sentence is identified as ``vi``."""
        self.assertEqual(
            lang_detect("Bộ Công Thương xóa một tổng cục, giảm nhiều đầu mối"),
            "vi",
        )

    def test_lang_detect_2(self):
        """A French sentence is identified as ``fr``."""
        self.assertEqual(lang_detect("Ceci est un texte français."), "fr")

    def test_lang_detect_3(self):
        """A Japanese sentence is identified as ``ja``."""
        self.assertEqual(
            lang_detect("如來の妙色身、 世間與に等しきは無し。"),
            "ja",
        )
6 changes: 6 additions & 0 deletions underthesea/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,11 @@
except Exception:
pass

# ``lang_detect`` depends on the optional ``lang-detect`` extra (fasttext).
# When that import fails, stay silent at package import time and expose a
# stub instead, so the name listed in ``__all__`` always exists and the
# failure is reported only to callers that actually use the feature.
try:
    from underthesea.pipeline.lang_detect import lang_detect
except Exception as e:
    # ``e`` is unbound once the except block ends, so keep our own reference
    # for exception chaining inside the stub.
    _lang_detect_import_error = e

    def lang_detect(*args, **kwargs):
        """Fallback used when the real ``lang_detect`` could not be imported.

        Raises:
            ImportError: always, chained to the original import failure.
        """
        raise ImportError(
            "lang_detect is unavailable; install the optional dependencies "
            "with `pip install underthesea[lang-detect]`"
        ) from _lang_detect_import_error


# lazy loading
def dependency_parse(*args, **kwargs):
Expand All @@ -61,6 +66,7 @@ def dependency_parse(*args, **kwargs):
'text_normalize',
'word_tokenize', 'pos_tag', 'chunk',
'ner',
'lang_detect',
'classify', 'sentiment',
'dependency_parse'
]
12 changes: 12 additions & 0 deletions underthesea/model_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class UTSModel(Enum):
sa_general = "SA_GENERAL"
sa_bank = "SA_BANK"
sa_bank_v131 = "SA_BANK_V131"
lang_detect_fast_text = "LANG_DETECT_FAST_TEXT"

# flake8: noqa: C901

Expand Down Expand Up @@ -161,6 +162,10 @@ def download(model_name):
if model_name == "VIET_TTS_V0_4_1":
ModelFetcher.download_zip(REPO[model_name])

if model_name == "LANG_DETECT_FAST_TEXT":
url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
cached_path(url, cache_dir=cache_dir)

@staticmethod
def list(all=False):
models = []
Expand Down Expand Up @@ -206,6 +211,13 @@ def get_model_path(model):

if model == UTSModel.sa_bank:
return Path(UNDERTHESEA_FOLDER) / "models" / "SA_BANK"

if model == UTSModel.sa_bank:
return Path(UNDERTHESEA_FOLDER) / "models" / "SA_BANK"

if model == "LANG_DETECT_FAST_TEXT":
return Path(UNDERTHESEA_FOLDER) / "models" / "lid.176.bin"

return Path(UNDERTHESEA_FOLDER) / "models" / model


Expand Down
8 changes: 8 additions & 0 deletions underthesea/models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,12 @@ VIET_TTS_V0_4_1:
year: 2023
url: https://github.com/undertheseanlp/underthesea/releases/download/resources/viet_tts_v0.4.1.zip
filename: viet_tts_v0.4.1.zip
LANG_DETECT_FAST_TEXT:
cache_dir: models
model_path: LANG_DETECT_FAST_TEXT
type: Lang Detect
license: Open
year: 2020
url: https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
filename: lid.176.bin

23 changes: 23 additions & 0 deletions underthesea/pipeline/lang_detect/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import fasttext
import os
from underthesea.model_fetcher import ModelFetcher

def _discard_message(x):
    """No-op stand-in for ``fasttext.FastText.eprint``; drops the message."""
    return None


# Replace fastText's eprint helper with the no-op above.
fasttext.FastText.eprint = _discard_message
# Cached model instance; filled in by the first lang_detect() call.
lang_detect_model = None


def lang_detect(text):
    """Identify the language of ``text`` with the fastText model.

    The model is loaded on first use and cached in the module-level
    ``lang_detect_model``. If the model file is not present at the path
    reported by ``ModelFetcher.get_model_path``, it is downloaded first via
    ``ModelFetcher.download``.

    Args:
        text (str): Text to identify. Newlines are replaced by spaces before
            prediction.

    Returns:
        str: The top predicted label with the ``__label__`` prefix removed,
        e.g. ``"vi"``.

    Raises:
        Exception: Any error raised while downloading or loading the model is
            propagated to the caller rather than being swallowed.
    """
    global lang_detect_model
    if lang_detect_model is None:
        # Resolve the path only when the model actually has to be loaded.
        model_name = "LANG_DETECT_FAST_TEXT"
        model_path = ModelFetcher.get_model_path(model_name)
        if not os.path.exists(model_path):
            ModelFetcher.download(model_name)
        # Let load failures surface here with their real cause; hiding them
        # only defers the crash to an opaque AttributeError on None below.
        lang_detect_model = fasttext.load_model(str(model_path))

    # fastText's predict() handles one line at a time and raises ValueError
    # on input containing "\n", so flatten multi-line text first.
    single_line = text.replace("\n", " ")
    predictions = lang_detect_model.predict(single_line)
    return predictions[0][0].replace('__label__', '')

0 comments on commit 0793386

Please sign in to comment.