diff --git a/.travis.yml b/.travis.yml index 11de47d..c76284b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -26,4 +26,4 @@ install: # 'python3' is a 'command not found' error on Windows but 'py' works on Windows only script: - black --check . - - python -m unittest discover -s tests -t . || python3 -m unittest discover -s tests -t . \ No newline at end of file + - python -m unittest discover -s tests -t . || python3 -m unittest discover -s tests -t . diff --git a/setup.cfg b/setup.cfg index e9010a8..42f2b20 100644 --- a/setup.cfg +++ b/setup.cfg @@ -29,13 +29,13 @@ python_requires = >=3.6.1 install_requires = numpy>=1.17 scikit-learn>=0.22 - spacy>=2.2.2 + spacy<3.0.0 tqdm>=4.3 nltk>=3.3 plotly>=4.2.0 pandas>=1.0.2 wordcloud>=1.5.0 - gensim>=3.6.0 + gensim>=3.6.0,<4.0 matplotlib>=3.1.0 # TODO pick the correct version. [options.extras_require] diff --git a/tests/test_representation.py b/tests/test_representation.py index 1b9fd03..896cb38 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -222,6 +222,29 @@ def test_dim_reduction_and_clustering_with_vector_series_input( else: result_s = test_function(vector_s, random_state=42) + # Binary categories: also test if it equals with + # the category labels inverted (e.g. [0, 1, 0] instead + # of [1, 0, 1], which makes no difference functionally) + if pd.api.types.is_categorical_dtype(result_s): + if len(result_s.cat.categories) == 2 and all( + result_s.cat.categories == [0, 1] + ): + try: + result_s_inverted = result_s.apply(lambda category: 1 - category) + pd.testing.assert_series_equal( + s_true, + result_s_inverted, + check_dtype=False, + rtol=0.1, + atol=0.1, + check_category_order=False, + check_categorical=False, + ) + return + # inverted comparison fails -> continue to normal comparison + except AssertionError: + pass + pd.testing.assert_series_equal( s_true, result_s, @@ -248,6 +271,29 @@ def test_dim_reduction_and_clustering_with_dataframe_input( else: result_s = test_function(df, random_state=42) + # Binary categories: also test if it equals with + # the category labels inverted (e.g. [0, 1, 0] instead + # of [1, 0, 1], which makes no difference functionally) + if pd.api.types.is_categorical_dtype(result_s): + if len(result_s.cat.categories) == 2 and all( + result_s.cat.categories == [0, 1] + ): + try: + result_s_inverted = result_s.apply(lambda category: 1 - category) + pd.testing.assert_series_equal( + s_true, + result_s_inverted, + check_dtype=False, + rtol=0.1, + atol=0.1, + check_category_order=False, + check_categorical=False, + ) + return + # inverted comparison fails -> continue to normal comparison + except AssertionError: + pass + pd.testing.assert_series_equal( s_true, result_s, @@ -255,6 +301,7 @@ def test_dim_reduction_and_clustering_with_dataframe_input( rtol=0.1, atol=0.1, check_category_order=False, + check_categorical=False, ) def test_normalize_DataFrame_also_as_output(self): diff --git a/texthero/_types.py b/texthero/_types.py index 1612510..0515a6b 100644 --- a/texthero/_types.py +++ b/texthero/_types.py @@ -70,7 +70,7 @@ def tfidf(s: TokenSeries) -> DataFrame: # This class is mainly for documentation in the docstring. -class HeroTypes(pd.Series, pd.DataFrame): +class HeroTypes: """ Hero Series Types ================= diff --git a/texthero/representation.py b/texthero/representation.py index 8bbfead..1969985 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -632,7 +632,7 @@ def kmeans( >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe( ... hero.term_frequency ... ) - >>> hero.kmeans(s, n_clusters=2, random_state=42) + >>> hero.kmeans(s, n_clusters=2, random_state=42) # doctest: +SKIP 0 1 1 0 2 1