From ddf4f3c66863b41e73a403d572d9a56fdf7991a5 Mon Sep 17 00:00:00 2001
From: Mario Graff <mgraffg@ieee.org>
Date: Fri, 21 Jun 2024 11:43:44 -0600
Subject: [PATCH 1/3] Subwords as parameter.

---
 dialectid/__init__.py             |  2 +-
 dialectid/tests/test_text_repr.py | 15 ++++++++++++---
 dialectid/text_repr.py            |  4 ++++
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/dialectid/__init__.py b/dialectid/__init__.py
index 6105538..796f9c9 100644
--- a/dialectid/__init__.py
+++ b/dialectid/__init__.py
@@ -20,7 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-__version__ = '0.0.3'
+__version__ = '0.0.4'
 
 from dialectid.text_repr import BoW
 from dialectid.model import DialectId
\ No newline at end of file
diff --git a/dialectid/tests/test_text_repr.py b/dialectid/tests/test_text_repr.py
index d19bada..6592bb4 100644
--- a/dialectid/tests/test_text_repr.py
+++ b/dialectid/tests/test_text_repr.py
@@ -30,9 +30,18 @@ def test_bow():
     """Test BoW"""
     from b4msa.textmodel import TextModel
 
-    bow = BoW(lang='es')
+    bow = BoW(lang='es', voc_size_exponent=13)
     assert isinstance(bow.bow, TextModel)
     X = bow.transform(['Buenos dias'])
-    bow2 = BoW(lang='es', loc='mx')
+    bow2 = BoW(lang='es', loc='mx', voc_size_exponent=13)
     X2 = bow2.transform(['Buenos dias'])
-    assert (X - X2).sum() != 0
\ No newline at end of file
+    assert (X - X2).sum() != 0
+
+
+def test_subwords():
+    """Test subwords"""
+
+    bow = BoW(lang='es', voc_size_exponent=13,
+              subwords=True)
+    bow.transform(['Hola'])
+    
\ No newline at end of file
diff --git a/dialectid/text_repr.py b/dialectid/text_repr.py
index ce6da57..e58fb88 100644
--- a/dialectid/text_repr.py
+++ b/dialectid/text_repr.py
@@ -39,10 +39,14 @@ def __init__(self, pretrain: bool=True,
                  v1: bool=False,
                  estimator_kwargs: dict=None,
                  loc: str=None,
+                 subwords: bool=False,
                  **kwargs):
         assert pretrain
         assert not v1
         self._bow = None
+        if subwords:
+            assert loc is None
+            loc = 'qgrams'
         self.loc = loc
         if estimator_kwargs is None:
             estimator_kwargs = {'dual': True, 'class_weight': 'balanced'}

From 3eb1dadb17052429b370f1f13de035793171433d Mon Sep 17 00:00:00 2001
From: Mario Graff <mgraffg@ieee.org>
Date: Fri, 2 Aug 2024 08:00:57 -0500
Subject: [PATCH 2/3] Add subwords option to DialectId and load_dialectid

---
 dialectid/model.py            | 10 ++++++++--
 dialectid/tests/test_model.py | 16 ++++++++++++----
 dialectid/utils.py            |  7 +++++--
 pyproject.toml                | 22 +++++++++++++++++++++-
 4 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/dialectid/model.py b/dialectid/model.py
index 4c31800..774b4be 100644
--- a/dialectid/model.py
+++ b/dialectid/model.py
@@ -32,6 +32,7 @@ class DialectId:
     """DialectId"""
     lang: str='es'
     voc_size_exponent: int=15
+    subwords: bool=True
 
     @property
     def bow(self):
@@ -43,8 +44,12 @@ def bow(self):
             path = BOW[self.lang].split('.')
             module = '.'.join(path[:-1])
             text_repr = importlib.import_module(module)
+            kwargs = {}
+            if module != 'EvoMSA.text_repr':
+                kwargs = dict(subwords=self.subwords)
             _ = getattr(text_repr, path[-1])(lang=self.lang,
-                                             voc_size_exponent=self.voc_size_exponent)
+                                             voc_size_exponent=self.voc_size_exponent,
+                                             **kwargs)
             self._bow = _
         return self._bow
 
@@ -55,7 +60,8 @@ def weights(self):
             return self._weights
         except AttributeError:
             self._weights = load_dialectid(self.lang,
-                                           self.voc_size_exponent)
+                                           self.voc_size_exponent,
+                                           self.subwords)
         return self._weights
     
     @property
diff --git a/dialectid/tests/test_model.py b/dialectid/tests/test_model.py
index baf316b..e2acb29 100644
--- a/dialectid/tests/test_model.py
+++ b/dialectid/tests/test_model.py
@@ -28,7 +28,7 @@ def test_DialectId():
     from dialectid.model import DialectId
     from dialectid import BoW
 
-    dialectid = DialectId(voc_size_exponent=15)
+    dialectid = DialectId(voc_size_exponent=15, subwords=False)
     assert dialectid.lang == 'es' and dialectid.voc_size_exponent == 15
     assert isinstance(dialectid.bow, BoW)
 
@@ -38,7 +38,7 @@ def test_DialectId_df():
 
     from dialectid.model import DialectId
 
-    dialectid = DialectId(voc_size_exponent=15)
+    dialectid = DialectId(voc_size_exponent=15, subwords=False)
     hy = dialectid.decision_function('comiendo tacos')
     assert hy.shape == (1, 20)
     assert hy.argmax(axis=1)[0] == 0
@@ -49,7 +49,7 @@ def test_countries():
 
     from dialectid.model import DialectId
 
-    dialectid = DialectId(voc_size_exponent=15)
+    dialectid = DialectId(voc_size_exponent=15, subwords=False)
     assert len(dialectid.countries) == 20
     assert dialectid.countries[0] == 'mx'
 
@@ -59,10 +59,18 @@ def test_predict():
 
     from dialectid.model import DialectId
 
-    dialectid = DialectId(voc_size_exponent=15)
+    dialectid = DialectId(voc_size_exponent=15, subwords=False)
     countries = dialectid.predict('comiendo tacos')
     assert countries[0] == 'mx'
     countries = dialectid.predict(['comiendo tacos',
                                    'tomando vino'])
     assert countries.shape == (2, )
 
+
+def test_DialectId_subwords():
+    """Test DialectId subwords"""
+
+    from dialectid.model import DialectId
+    dialectid = DialectId(voc_size_exponent=15)
+    countries = dialectid.predict('comiendo tacos')
+    assert countries[0] == 'mx'    
diff --git a/dialectid/utils.py b/dialectid/utils.py
index 2932c3b..5e1c5cf 100644
--- a/dialectid/utils.py
+++ b/dialectid/utils.py
@@ -146,13 +146,16 @@ def load(filename):
     return data
 
 
-def load_dialectid(lang, dim):
+def load_dialectid(lang, dim, subwords):
     """Load url"""
 
     diroutput = join(dirname(__file__), 'models')
     if not isdir(diroutput):
         os.mkdir(diroutput)
-    filename = f'dialectid_{lang}_{dim}.json.gz'
+    if subwords:
+        filename = f'dialectid_subwords_{lang}_{dim}.json.gz'
+    else:
+        filename = f'dialectid_{lang}_{dim}.json.gz'
     output = join(diroutput, filename)
     if not isfile(output):
         Download(f'{BASEURL}/{filename}', output)
diff --git a/pyproject.toml b/pyproject.toml
index 1a14217..0d035e9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,7 @@
 [project]
 name = 'dialectid'
+description = "Set of algorithms to detect the dialect of a given text"
+readme = "README.rst"
 dependencies = [
     'numpy',
     'scikit-learn>=1.3.0',
@@ -8,9 +10,27 @@ dependencies = [
     'EvoMSA'
 ]
 dynamic = ['version']
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Environment :: Console",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Information Technology",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence"
+]
+
+
 
 [tool.setuptools.dynamic]
 version = {attr = 'dialectid.__version__'}
 
 [tool.setuptools]
-packages = ['dialectid', 'dialectid.tests']
\ No newline at end of file
+packages = ['dialectid', 'dialectid.tests']
+
+[project.urls]
+Homepage = "https://ingeotec.github.io/dialectid"
+Repository = "https://github.com/INGEOTEC/dialectid"
+Issues = "https://github.com/INGEOTEC/dialectid/issues"
\ No newline at end of file

From 7d67c0705bf86b2037a0a4880fa16864af196752 Mon Sep 17 00:00:00 2001
From: Mario Graff <mgraffg@ieee.org>
Date: Fri, 2 Aug 2024 09:43:57 -0600
Subject: [PATCH 3/3] Default load_dialectid's subwords argument to False

---
 dialectid/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dialectid/utils.py b/dialectid/utils.py
index 5e1c5cf..d8a9013 100644
--- a/dialectid/utils.py
+++ b/dialectid/utils.py
@@ -146,7 +146,7 @@ def load(filename):
     return data
 
 
-def load_dialectid(lang, dim, subwords):
+def load_dialectid(lang, dim, subwords=False):
     """Load url"""
 
     diroutput = join(dirname(__file__), 'models')