Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-233: Add lang_detect module #733

Merged
merged 3 commits into from
Jun 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,31 @@ Satisfaction, guaranteed.
```
</details>

<details>
<summary><b><a href="">Lang Detect</a></b> - Identifying the Language of Text
<code>⚛️</code>
</summary>

<br/>

Lang Detect API, built on the excellent language identification models from [fastText](https://fasttext.cc/docs/en/language-identification.html).

Install the extra dependencies and models

```bash
$ pip install underthesea[lang-detect]
```

Usage example in a script

```python
>>> from underthesea.pipeline.lang_detect import lang_detect

>>> lang_detect("Cựu binh Mỹ trả nhật ký nhẹ lòng khi thấy cuộc sống hòa bình tại Việt Nam")
'vi'
```
</details>

<details>
<summary><b><a href="">Say 🗣️</a></b> - Converting written text into spoken audio
<code>⚛️</code>
Expand Down Expand Up @@ -363,7 +388,7 @@ Resource CP_Vietnamese_VLC_v2_2022 is downloaded in ~/.underthesea/datasets/CP_V

* Automatic Speech Recognition
* Machine Translation
* Chatbot (Chat & Speak)
* Chatbot Agent

## Contributing

Expand Down
3 changes: 3 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@
],
'prompt': [
'openai'
],
'lang-detect': [
'fasttext '
]
}
setup(
Expand Down
Empty file.
20 changes: 20 additions & 0 deletions tests/pipeline/lang_detect/test_lang_detect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-
from unittest import TestCase
from underthesea import lang_detect


class TestLangDetect(TestCase):
    """Smoke tests for ``lang_detect``: one sample text per expected language code."""

    def test_lang_detect_1(self):
        # Vietnamese sample; the detector should answer "vi".
        detected = lang_detect("Bộ Công Thương xóa một tổng cục, giảm nhiều đầu mối")
        self.assertEqual(detected, "vi")

    def test_lang_detect_2(self):
        # French sample; the detector should answer "fr".
        detected = lang_detect("Ceci est un texte français.")
        self.assertEqual(detected, "fr")

    def test_lang_detect_3(self):
        # Sample mixing kanji and kana; the detector should answer "ja".
        detected = lang_detect("如來の妙色身、 世間與に等しきは無し。")
        self.assertEqual(detected, "ja")
6 changes: 6 additions & 0 deletions underthesea/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,11 @@
except Exception:
pass

try:
    from underthesea.pipeline.lang_detect import lang_detect
except Exception as e:
    # The lang-detect feature depends on optional packages. Instead of printing
    # to stdout at import time (noisy for every user who did not install the
    # extra) and leaving the name undefined even though it is exported through
    # __all__, keep the original failure and expose a stub that reports it
    # only when the feature is actually used.
    _lang_detect_import_error = e

    def lang_detect(*args, **kwargs):
        """Placeholder used when the real ``lang_detect`` could not be imported.

        Accepts any arguments so that it can stand in for the real function.

        Raises:
            ImportError: Always; chained to the exception that made the
                import of ``underthesea.pipeline.lang_detect`` fail.
        """
        raise ImportError(
            "lang_detect is unavailable because its optional dependencies "
            "could not be imported; install them with "
            "`pip install underthesea[lang-detect]`"
        ) from _lang_detect_import_error


# lazy loading
def dependency_parse(*args, **kwargs):
Expand All @@ -61,6 +66,7 @@ def dependency_parse(*args, **kwargs):
'text_normalize',
'word_tokenize', 'pos_tag', 'chunk',
'ner',
'lang_detect',
'classify', 'sentiment',
'dependency_parse'
]
12 changes: 12 additions & 0 deletions underthesea/model_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class UTSModel(Enum):
sa_general = "SA_GENERAL"
sa_bank = "SA_BANK"
sa_bank_v131 = "SA_BANK_V131"
lang_detect_fast_text = "LANG_DETECT_FAST_TEXT"

# flake8: noqa: C901

Expand Down Expand Up @@ -161,6 +162,10 @@ def download(model_name):
if model_name == "VIET_TTS_V0_4_1":
ModelFetcher.download_zip(REPO[model_name])

if model_name == "LANG_DETECT_FAST_TEXT":
url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
cached_path(url, cache_dir=cache_dir)

@staticmethod
def list(all=False):
models = []
Expand Down Expand Up @@ -206,6 +211,13 @@ def get_model_path(model):

if model == UTSModel.sa_bank:
return Path(UNDERTHESEA_FOLDER) / "models" / "SA_BANK"

if model == UTSModel.sa_bank:
return Path(UNDERTHESEA_FOLDER) / "models" / "SA_BANK"

if model == "LANG_DETECT_FAST_TEXT":
return Path(UNDERTHESEA_FOLDER) / "models" / "lid.176.bin"

return Path(UNDERTHESEA_FOLDER) / "models" / model


Expand Down
8 changes: 8 additions & 0 deletions underthesea/models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,12 @@ VIET_TTS_V0_4_1:
year: 2023
url: https://github.com/undertheseanlp/underthesea/releases/download/resources/viet_tts_v0.4.1.zip
filename: viet_tts_v0.4.1.zip
LANG_DETECT_FAST_TEXT:
cache_dir: models
model_path: LANG_DETECT_FAST_TEXT
type: Lang Detect
license: Open
year: 2020
url: https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
filename: lid.176.bin

23 changes: 23 additions & 0 deletions underthesea/pipeline/lang_detect/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import fasttext
import os
from underthesea.model_fetcher import ModelFetcher

# Replace fastText's `eprint` helper with a no-op so that nothing it would
# print is shown.
# NOTE(review): presumably this silences the warning fastText emits when a
# model is loaded — confirm against the installed fasttext version.
fasttext.FastText.eprint = lambda x: None
# Module-level cache for the loaded model; lang_detect() fills it on first use.
lang_detect_model = None


def lang_detect(text):
    """Identify the language of ``text`` with the fastText language-ID model.

    The model is loaded once and cached in the module-level
    ``lang_detect_model``. If the model file is missing from the path reported
    by ``ModelFetcher.get_model_path``, it is downloaded before loading.

    Args:
        text (str): Text to classify. Newline characters are replaced with
            spaces before prediction, because fastText's ``predict`` handles
            a single line at a time and rejects input containing ``"\\n"``.

    Returns:
        str: The top predicted label with its ``__label__`` prefix removed,
        for example ``"vi"``.

    Raises:
        Exception: Whatever ``ModelFetcher.download`` or
            ``fasttext.load_model`` raises when the model cannot be fetched
            or loaded. The error is propagated instead of being swallowed.
    """
    global lang_detect_model
    if lang_detect_model is None:
        model_name = "LANG_DETECT_FAST_TEXT"
        model_path = ModelFetcher.get_model_path(model_name)
        if not os.path.exists(model_path):
            ModelFetcher.download(model_name)
        # Deliberately not wrapped in a catch-all: a swallowed load failure
        # would leave the cache empty and surface later as an opaque
        # AttributeError on ``None.predict``.
        lang_detect_model = fasttext.load_model(str(model_path))

    # fastText predicts one line at a time, so flatten multi-line input.
    single_line = text.replace("\n", " ")
    predictions = lang_detect_model.predict(single_line)
    # predictions[0] holds the labels; keep the first one without its prefix.
    return predictions[0][0].replace('__label__', '')
Loading