From a525f543f95e713825753c0cba71a30e98cd47cc Mon Sep 17 00:00:00 2001 From: Mario Graff Date: Wed, 5 Jun 2024 10:22:52 -0500 Subject: [PATCH 1/2] BoW --- .github/workflows/pip.yaml | 2 +- .github/workflows/test.yaml | 2 +- dialectid/__init__.py | 4 +- dialectid/tests/test_text_repr.py | 34 +++++++++++++++ dialectid/tests/test_utils.py | 7 ++++ dialectid/text_repr.py | 69 +++++++++++++++++++++++++++++++ dialectid/utils.py | 34 ++++++++++++++- pyproject.toml | 5 ++- 8 files changed, 151 insertions(+), 6 deletions(-) create mode 100644 dialectid/tests/test_text_repr.py create mode 100644 dialectid/text_repr.py diff --git a/.github/workflows/pip.yaml b/.github/workflows/pip.yaml index 4e52ef3..f41cdd0 100644 --- a/.github/workflows/pip.yaml +++ b/.github/workflows/pip.yaml @@ -19,7 +19,7 @@ jobs: steps: - uses: actions/checkout@v2 - name: Set up Python - uses: conda-incubator/setup-miniconda@v2 + uses: conda-incubator/setup-miniconda@v3 with: activate-environment: test auto-update-conda: true diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 404f24e..3680cd6 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -13,7 +13,7 @@ jobs: shell: bash -l {0} strategy: matrix: - os: [ubuntu-latest, macos-latest, windows-latest] + os: [ubuntu-latest, windows-latest] python-version: ["3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v2 diff --git a/dialectid/__init__.py b/dialectid/__init__.py index f322558..e83f481 100644 --- a/dialectid/__init__.py +++ b/dialectid/__init__.py @@ -20,4 +20,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -__version__ = '0.0.1' \ No newline at end of file +__version__ = '0.0.1' + +from dialectid.text_repr import BoW \ No newline at end of file diff --git a/dialectid/tests/test_text_repr.py b/dialectid/tests/test_text_repr.py new file mode 100644 index 0000000..8aa8778 --- /dev/null +++ b/dialectid/tests/test_text_repr.py @@ -0,0 +1,34 @@ +# MIT License + +# Copyright (c) 2024 Eric Sadit Tellez Avila, Daniela Alejandra Moctezuma Ochoa, Luis Guillermo Ruiz Velazquez, Mario Graff Guerrero + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# https://www.cia.gov/the-world-factbook/about/archives/2021/field/languages/ + + +from dialectid.text_repr import BoW + + +def test_bow(): + """Test BoW""" + from b4msa.textmodel import TextModel + + bow = BoW(lang='es') + assert isinstance(bow.bow, TextModel) + X = bow.transform(['Buenos dias']) \ No newline at end of file diff --git a/dialectid/tests/test_utils.py b/dialectid/tests/test_utils.py index c80a21d..10876a8 100644 --- a/dialectid/tests/test_utils.py +++ b/dialectid/tests/test_utils.py @@ -54,3 +54,10 @@ def test_countries(): for i in v: assert len(i) == 2 +def test_load_bow(): + """Test load_bow""" + + from microtc.utils import Counter + + c = utils.load_bow() + assert isinstance(c, Counter) \ No newline at end of file diff --git a/dialectid/text_repr.py b/dialectid/text_repr.py new file mode 100644 index 0000000..47806cc --- /dev/null +++ b/dialectid/text_repr.py @@ -0,0 +1,69 @@ +# MIT License + +# Copyright (c) 2024 Eric Sadit Tellez Avila, Daniela Alejandra Moctezuma Ochoa, Luis Guillermo Ruiz Velazquez, Mario Graff Guerrero + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from sklearn.svm import LinearSVC +from EvoMSA import BoW as EvoMSABoW +from EvoMSA.utils import b4msa_params +from b4msa.textmodel import TextModel +from microtc.weighting import TFIDF +from dialectid.utils import load_bow + + +class BoW(EvoMSABoW): + """BoW + + >>> from dialectid import BoW + >>> bow = BoW(lang='es') + >>> bow.transform(['Buenos dias', 'Disfruta dialectid']) + """ + + def __init__(self, pretrain: bool=True, + v1: bool=False, + estimator_class=LinearSVC, + estimator_kwargs=dict(dual=True, + class_weight='balanced'), + **kwargs): + assert pretrain + assert not v1 + super(BoW, self).__init__(pretrain=pretrain, + v1=v1, **kwargs) + + @property + def bow(self): + """BoW""" + + try: + bow = self._bow + except AttributeError: + freq = load_bow(lang=self.lang, + d=self.voc_size_exponent, + func=self.voc_selection) + params = b4msa_params(lang=self.lang, + dim=self._voc_size_exponent) + params.update(self.b4msa_kwargs) + bow = TextModel(**params) + tfidf = TFIDF() + tfidf.N = freq.update_calls + tfidf.word2id, tfidf.wordWeight = tfidf.counter2weight(freq) + bow.model = tfidf + self._bow = bow + return bow \ No newline at end of file diff --git a/dialectid/utils.py b/dialectid/utils.py index 0e62804..8bc1cbe 100644 --- a/dialectid/utils.py +++ b/dialectid/utils.py @@ -21,6 +21,13 @@ # SOFTWARE. # https://www.cia.gov/the-world-factbook/about/archives/2021/field/languages/ +from EvoMSA.utils import Download +from microtc.utils import Counter +from os.path import join, dirname, isdir, isfile +import gzip +import os + +BASEURL = 'https://github.com/INGEOTEC/dialectid/releases/download/data' COUNTRIES = {'es':['mx', 'cl', 'es', # Mexico (MX), Chile (CL), Spain (ES) 'ar', 'co', 'pe', # Argentina (AR), Colombia (CO), Peru (PE) @@ -91,5 +98,28 @@ 'zh':['cn', 'sg', 'hk', # China, Singapore, Hong Kong 'tw' # Taiwan ] - } - \ No newline at end of file + } + + +def load_bow(lang='es', d=17, func='most_common_by_type'): + """Load BoW model from dialectid""" + + def load(filename): + try: + with gzip.open(filename, 'rb') as fpt: + return str(fpt.read(), encoding='utf-8') + except Exception: + os.unlink(filename) + raise Exception(filename) + + lang = lang.lower().strip() + diroutput = join(dirname(__file__), 'models') + if not isdir(diroutput): + os.mkdir(diroutput) + filename = f'{lang}_bow_{func}_{d}.json.gz' + url = f'{BASEURL}/{filename}' + output = join(diroutput, filename) + if not isfile(output): + Download(url, output) + return Counter.fromjson(load(output)) + diff --git a/pyproject.toml b/pyproject.toml index 59551db..1a14217 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,10 @@ name = 'dialectid' dependencies = [ 'numpy', - 'scikit-learn>=1.3.0' + 'scikit-learn>=1.3.0', + 'microtc', + 'b4msa', + 'EvoMSA' ] dynamic = ['version'] From 64e7e106ebc6f13271c291280be9e43452b8342f Mon Sep 17 00:00:00 2001 From: Mario Graff Date: Wed, 5 Jun 2024 10:39:44 -0500 Subject: [PATCH 2/2] Missing EvoMSA --- .github/workflows/pip.yaml | 4 ++-- .github/workflows/test.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pip.yaml b/.github/workflows/pip.yaml index f41cdd0..783f922 100644 --- a/.github/workflows/pip.yaml +++ b/.github/workflows/pip.yaml @@ -14,7 +14,7 @@ jobs: shell: bash -l {0} strategy: matrix: - os: [ubuntu-latest, macos-latest, windows-latest] + os: [ubuntu-latest, windows-latest] python-version: ["3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v2 @@ -32,7 +32,7 @@ jobs: run: | conda install --yes pip pip install twine build - conda install --yes numpy scipy scikit-learn nose + conda install --yes numpy scipy scikit-learn nose evomsa python -m build - name: Pip if: ${{ runner.os == 'Linux' }} diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 3680cd6..7c3fee3 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -32,7 +32,7 @@ jobs: conda install --yes pip pip install coverage pip install coveralls - conda install --yes numpy scipy scikit-learn nose + conda install --yes numpy scipy scikit-learn nose evomsa python setup.py build_ext --inplace - name: Tests on Linux if: ${{ runner.os == 'Linux' }}