From ddf4f3c66863b41e73a403d572d9a56fdf7991a5 Mon Sep 17 00:00:00 2001 From: Mario Graff Date: Fri, 21 Jun 2024 11:43:44 -0600 Subject: [PATCH 1/3] Subwords as parameter. --- dialectid/__init__.py | 2 +- dialectid/tests/test_text_repr.py | 15 ++++++++++++--- dialectid/text_repr.py | 4 ++++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/dialectid/__init__.py b/dialectid/__init__.py index 6105538..796f9c9 100644 --- a/dialectid/__init__.py +++ b/dialectid/__init__.py @@ -20,7 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -__version__ = '0.0.3' +__version__ = '0.0.4' from dialectid.text_repr import BoW from dialectid.model import DialectId \ No newline at end of file diff --git a/dialectid/tests/test_text_repr.py b/dialectid/tests/test_text_repr.py index d19bada..6592bb4 100644 --- a/dialectid/tests/test_text_repr.py +++ b/dialectid/tests/test_text_repr.py @@ -30,9 +30,18 @@ def test_bow(): """Test BoW""" from b4msa.textmodel import TextModel - bow = BoW(lang='es') + bow = BoW(lang='es', voc_size_exponent=13) assert isinstance(bow.bow, TextModel) X = bow.transform(['Buenos dias']) - bow2 = BoW(lang='es', loc='mx') + bow2 = BoW(lang='es', loc='mx', voc_size_exponent=13) X2 = bow2.transform(['Buenos dias']) - assert (X - X2).sum() != 0 \ No newline at end of file + assert (X - X2).sum() != 0 + + +def test_subwords(): + """Test subwords""" + + bow = BoW(lang='es', voc_size_exponent=13, + subwords=True) + bow.transform(['Hola']) + \ No newline at end of file diff --git a/dialectid/text_repr.py b/dialectid/text_repr.py index ce6da57..e58fb88 100644 --- a/dialectid/text_repr.py +++ b/dialectid/text_repr.py @@ -39,10 +39,14 @@ def __init__(self, pretrain: bool=True, v1: bool=False, estimator_kwargs: dict=None, loc: str=None, + subwords: bool=False, **kwargs): assert pretrain assert not v1 self._bow = None + if subwords: + assert loc is None + loc = 'qgrams' self.loc = loc if estimator_kwargs is None: estimator_kwargs = {'dual': True, 'class_weight': 'balanced'} From 3eb1dadb17052429b370f1f13de035793171433d Mon Sep 17 00:00:00 2001 From: Mario Graff Date: Fri, 2 Aug 2024 08:00:57 -0500 Subject: [PATCH 2/3] subwords --- dialectid/model.py | 10 ++++++++-- dialectid/tests/test_model.py | 16 ++++++++++++---- dialectid/utils.py | 7 +++++-- pyproject.toml | 22 +++++++++++++++++++++- 4 files changed, 46 insertions(+), 9 deletions(-) diff --git a/dialectid/model.py b/dialectid/model.py index 4c31800..774b4be 100644 --- a/dialectid/model.py +++ b/dialectid/model.py @@ -32,6 +32,7 @@ class DialectId: """DialectId""" lang: str='es' voc_size_exponent: int=15 + subwords: bool=True @property def bow(self): @@ -43,8 +44,12 @@ def bow(self): path = BOW[self.lang].split('.') module = '.'.join(path[:-1]) text_repr = importlib.import_module(module) + kwargs = {} + if module != 'EvoMSA.text_repr': + kwargs = dict(subwords=self.subwords) _ = getattr(text_repr, path[-1])(lang=self.lang, - voc_size_exponent=self.voc_size_exponent) + voc_size_exponent=self.voc_size_exponent, + **kwargs) self._bow = _ return self._bow @@ -55,7 +60,8 @@ def weights(self): return self._weights except AttributeError: self._weights = load_dialectid(self.lang, - self.voc_size_exponent) + self.voc_size_exponent, + self.subwords) return self._weights @property diff --git a/dialectid/tests/test_model.py b/dialectid/tests/test_model.py index baf316b..e2acb29 100644 --- a/dialectid/tests/test_model.py +++ b/dialectid/tests/test_model.py @@ -28,7 +28,7 @@ def test_DialectId(): from dialectid.model import DialectId from dialectid import BoW - dialectid = DialectId(voc_size_exponent=15) + dialectid = DialectId(voc_size_exponent=15, subwords=False) assert dialectid.lang == 'es' and dialectid.voc_size_exponent == 15 assert isinstance(dialectid.bow, BoW) @@ -38,7 +38,7 @@ def test_DialectId_df(): from dialectid.model import DialectId - dialectid = DialectId(voc_size_exponent=15) + dialectid = DialectId(voc_size_exponent=15, subwords=False) hy = dialectid.decision_function('comiendo tacos') assert hy.shape == (1, 20) assert hy.argmax(axis=1)[0] == 0 @@ -49,7 +49,7 @@ def test_countries(): from dialectid.model import DialectId - dialectid = DialectId(voc_size_exponent=15) + dialectid = DialectId(voc_size_exponent=15, subwords=False) assert len(dialectid.countries) == 20 assert dialectid.countries[0] == 'mx' @@ -59,10 +59,18 @@ def test_predict(): from dialectid.model import DialectId - dialectid = DialectId(voc_size_exponent=15) + dialectid = DialectId(voc_size_exponent=15, subwords=False) countries = dialectid.predict('comiendo tacos') assert countries[0] == 'mx' countries = dialectid.predict(['comiendo tacos', 'tomando vino']) assert countries.shape == (2, ) + +def test_DialectId_subwords(): + """Test DialectId subwords""" + + from dialectid.model import DialectId + dialectid = DialectId(voc_size_exponent=15) + countries = dialectid.predict('comiendo tacos') + assert countries[0] == 'mx' diff --git a/dialectid/utils.py b/dialectid/utils.py index 2932c3b..5e1c5cf 100644 --- a/dialectid/utils.py +++ b/dialectid/utils.py @@ -146,13 +146,16 @@ def load(filename): return data -def load_dialectid(lang, dim): +def load_dialectid(lang, dim, subwords): """Load url""" diroutput = join(dirname(__file__), 'models') if not isdir(diroutput): os.mkdir(diroutput) - filename = f'dialectid_{lang}_{dim}.json.gz' + if subwords: + filename = f'dialectid_subwords_{lang}_{dim}.json.gz' + else: + filename = f'dialectid_{lang}_{dim}.json.gz' output = join(diroutput, filename) if not isfile(output): Download(f'{BASEURL}/{filename}', output) diff --git a/pyproject.toml b/pyproject.toml index 1a14217..0d035e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,7 @@ [project] name = 'dialectid' +description = "Set of algorithms to detect the dialect of a given text" +readme = "README.rst" dependencies = [ 'numpy', 'scikit-learn>=1.3.0', @@ -8,9 +10,27 @@ dependencies = [ 'EvoMSA' ] dynamic = ['version'] +classifiers = [ + "Development Status :: 3 - Alpha", + "Environment :: Console", + "Intended Audience :: Developers", + "Intended Audience :: Information Technology", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Topic :: Scientific/Engineering :: Artificial Intelligence" +] + + [tool.setuptools.dynamic] version = {attr = 'dialectid.__version__'} [tool.setuptools] -packages = ['dialectid', 'dialectid.tests'] \ No newline at end of file +packages = ['dialectid', 'dialectid.tests'] + +[project.urls] +Homepage = "https://ingeotec.github.io/dialectid" +Repository = "https://github.com/INGEOTEC/dialectid" +Issues = "https://github.com/INGEOTEC/dialectid/issues" \ No newline at end of file From 7d67c0705bf86b2037a0a4880fa16864af196752 Mon Sep 17 00:00:00 2001 From: Mario Graff Date: Fri, 2 Aug 2024 09:43:57 -0600 Subject: [PATCH 3/3] Missing argument --- dialectid/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dialectid/utils.py b/dialectid/utils.py index 5e1c5cf..d8a9013 100644 --- a/dialectid/utils.py +++ b/dialectid/utils.py @@ -146,7 +146,7 @@ def load(filename): return data -def load_dialectid(lang, dim, subwords): +def load_dialectid(lang, dim, subwords=False): """Load url""" diroutput = join(dirname(__file__), 'models')