diff --git a/tests/test_representation.py b/tests/test_representation.py
index dacf99cc..d4b73e6d 100644
--- a/tests/test_representation.py
+++ b/tests/test_representation.py
@@ -20,28 +20,28 @@ def load_tests(loader, tests, ignore):
 
 class TestRepresentation(PandasTestCase):
     """
-    Term Frequency.
+    Count.
     """
 
-    def test_term_frequency_single_document(self):
+    def test_count_single_document(self):
         s = pd.Series("a b c c")
         s_true = pd.Series([[1, 1, 2]])
-        self.assertEqual(representation.term_frequency(s), s_true)
+        self.assertEqual(representation.count(s), s_true)
 
-    def test_term_frequency_multiple_documents(self):
+    def test_count_multiple_documents(self):
         s = pd.Series(["doc_one", "doc_two"])
         s_true = pd.Series([[1, 0], [0, 1]])
-        self.assertEqual(representation.term_frequency(s), s_true)
+        self.assertEqual(representation.count(s), s_true)
 
-    def test_term_frequency_not_lowercase(self):
+    def test_count_not_lowercase(self):
         s = pd.Series(["one ONE"])
         s_true = pd.Series([[1, 1]])
-        self.assertEqual(representation.term_frequency(s), s_true)
+        self.assertEqual(representation.count(s), s_true)
 
-    def test_term_frequency_punctuation_are_kept(self):
+    def test_count_punctuation_are_kept(self):
         s = pd.Series(["one !"])
         s_true = pd.Series([[1, 1]])
-        self.assertEqual(representation.term_frequency(s), s_true)
+        self.assertEqual(representation.count(s), s_true)
 
     """
     TF-IDF
@@ -100,3 +100,27 @@ def test_most_similar_raise_with_not_in_index(self):
         to = "two"
         with self.assertRaisesRegex(ValueError, r"index"):
             representation.most_similar(s_embed, to)
+
+    """
+    Term Frequency
+    """
+
+    def test_term_frequency_single_document(self):
+        s = pd.Series("a b c c")
+        s_true = pd.Series([[1, 1, 2]])
+        self.assertEqual(representation.term_frequency(s), s_true)
+
+    def test_term_frequency_multiple_documents(self):
+        s = pd.Series(["doc_one", "doc_two"])
+        s_true = pd.Series([[1, 1]])
+        self.assertEqual(representation.term_frequency(s), s_true)
+
+    def test_term_frequency_not_lowercase(self):
+        s = pd.Series(["one ONE"])
+        s_true = pd.Series([[1, 1]])
+        self.assertEqual(representation.term_frequency(s), s_true)
+
+    def test_term_frequency_punctuation_are_kept(self):
+        s = pd.Series(["one !"])
+        s_true = pd.Series([[1, 1]])
+        self.assertEqual(representation.term_frequency(s), s_true)
diff --git a/texthero/representation.py b/texthero/representation.py
index c4c477dc..98d66d18 100644
--- a/texthero/representation.py
+++ b/texthero/representation.py
@@ -3,6 +3,7 @@
 """
 
 import pandas as pd
+import numpy as np
 
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from sklearn.manifold import TSNE
@@ -24,11 +25,9 @@
 """
 
 
-def term_frequency(
-    s: pd.Series, max_features: Optional[int] = None, return_feature_names=False
-):
+def count(s: pd.Series, max_features: Optional[int] = None, return_feature_names=False):
     """
-    Represent a text-based Pandas Series using term_frequency.
+    Represent a text-based Pandas Series using count.
 
     Parameters
     ----------
@@ -36,7 +35,7 @@ def term_frequency(
     max_features : int, optional
         Maximum number of features to keep.
     return_features_names : Boolean, False by Default
-        If True, return a tuple (*term_frequency_series*, *features_names*)
+        If True, return a tuple (*count_series*, *features_names*)
 
 
     Examples
@@ -44,7 +43,7 @@ def term_frequency(
     >>> import texthero as hero
     >>> import pandas as pd
     >>> s = pd.Series(["Sentence one", "Sentence two"])
-    >>> hero.term_frequency(s)
+    >>> hero.count(s)
     0    [1, 1, 0]
     1    [1, 0, 1]
     dtype: object
@@ -54,7 +53,7 @@ def term_frequency(
     >>> import texthero as hero
     >>> import pandas as pd
     >>> s = pd.Series(["Sentence one", "Sentence two"])
-    >>> hero.term_frequency(s, return_feature_names=True)
+    >>> hero.count(s, return_feature_names=True)
     (0    [1, 1, 0]
     1    [1, 0, 1]
     dtype: object, ['Sentence', 'one', 'two'])
@@ -72,6 +71,60 @@ def term_frequency(
         return s
 
 
+def term_frequency(
+    s: pd.Series, max_features: Optional[int] = None, return_feature_names=False
+):
+
+    """
+    Represent a text-based Pandas Series using term frequency.
+
+    The occurrences of every term are summed over all documents of
+    the Series, so the result always holds a single row (index 0).
+
+    Parameters
+    ----------
+    s : Pandas Series
+    max_features : int, optional
+        Maximum number of features to keep.
+    return_feature_names : Boolean, False by Default
+        If True, return a tuple (*term_frequency_series*, *features_names*)
+
+
+    Examples
+    --------
+    >>> import texthero as hero
+    >>> import pandas as pd
+    >>> s = pd.Series(["Sentence one", "Sentence two"])
+    >>> hero.term_frequency(s)
+    0    [2, 1, 1]
+    dtype: object
+
+    To return the features_names:
+
+    >>> import texthero as hero
+    >>> import pandas as pd
+    >>> s = pd.Series(["Sentence one", "Sentence two"])
+    >>> hero.term_frequency(s, return_feature_names=True)
+    (0    [2, 1, 1]
+    dtype: object, ['Sentence', 'one', 'two'])
+
+    """
+
+    tf = CountVectorizer(
+        max_features=max_features, lowercase=False, token_pattern=r"\S+"
+    )
+
+    # Column-wise sum of the document-term matrix: one total per term.
+    series = np.asarray(tf.fit_transform(s).sum(axis=0))
+
+    s = pd.Series(series.tolist(), index=[0])
+
+    if return_feature_names:
+        return (s, tf.get_feature_names())
+    else:
+        return s
+
+
 def tfidf(s: pd.Series, max_features=None, min_df=1, return_feature_names=False):
     """
     Represent a text-based Pandas Series using TF-IDF.