Formatted

jbesomi · Jul 15, 2020 · caee34a · caee34a
1 parent 452c93c
commit caee34a
Show file tree

Hide file tree

Showing 2 changed files with 89 additions and 16 deletions.
diff --git a/tests/test_representation.py b/tests/test_representation.py
@@ -20,28 +20,28 @@ def load_tests(loader, tests, ignore):
 
 class TestRepresentation(PandasTestCase):
     """
-    Term Frequency.
+    Count.
     """
 
-    def test_term_frequency_single_document(self):
+    def test_count_single_document(self):
         s = pd.Series("a b c c")
         s_true = pd.Series([[1, 1, 2]])
-        self.assertEqual(representation.term_frequency(s), s_true)
+        self.assertEqual(representation.count(s), s_true)
 
-    def test_term_frequency_multiple_documents(self):
+    def test_count_multiple_documents(self):
         s = pd.Series(["doc_one", "doc_two"])
         s_true = pd.Series([[1, 0], [0, 1]])
-        self.assertEqual(representation.term_frequency(s), s_true)
+        self.assertEqual(representation.count(s), s_true)
 
-    def test_term_frequency_not_lowercase(self):
+    def test_count_not_lowercase(self):
         s = pd.Series(["one ONE"])
         s_true = pd.Series([[1, 1]])
-        self.assertEqual(representation.term_frequency(s), s_true)
+        self.assertEqual(representation.count(s), s_true)
 
-    def test_term_frequency_punctuation_are_kept(self):
+    def test_count_punctuation_are_kept(self):
         s = pd.Series(["one !"])
         s_true = pd.Series([[1, 1]])
-        self.assertEqual(representation.term_frequency(s), s_true)
+        self.assertEqual(representation.count(s), s_true)
 
     """
     TF-IDF
@@ -100,3 +100,27 @@ def test_most_similar_raise_with_not_in_index(self):
         to = "two"
         with self.assertRaisesRegex(ValueError, r"index"):
             representation.most_similar(s_embed, to)
+
+    """
+        Term Frequency
+    """
+
+    def test_term_frequency_single_document(self):
+        s = pd.Series("a b c c")
+        s_true = pd.Series([[1, 1, 2]])  # one count per token; "c" occurs twice
+        self.assertEqual(representation.term_frequency(s), s_true)
+
+    def test_term_frequency_multiple_documents(self):
+        s = pd.Series(["doc_one", "doc_two"])
+        s_true = pd.Series([[1, 1]])  # single row: counts are summed over documents
+        self.assertEqual(representation.term_frequency(s), s_true)
+
+    def test_term_frequency_not_lowercase(self):
+        s = pd.Series(["one ONE"])
+        s_true = pd.Series([[1, 1]])  # "one" and "ONE" stay distinct tokens
+        self.assertEqual(representation.term_frequency(s), s_true)
+
+    def test_term_frequency_punctuation_are_kept(self):
+        s = pd.Series(["one !"])
+        s_true = pd.Series([[1, 1]])  # "!" is counted as a token of its own
+        self.assertEqual(representation.term_frequency(s), s_true)
diff --git a/texthero/representation.py b/texthero/representation.py
@@ -3,6 +3,7 @@
 """
 
 import pandas as pd
+import numpy as np
 
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from sklearn.manifold import TSNE
@@ -24,27 +25,25 @@
 """
 
 
-def term_frequency(
-    s: pd.Series, max_features: Optional[int] = None, return_feature_names=False
-):
+def count(s: pd.Series, max_features: Optional[int] = None, return_feature_names=False):
     """
-    Represent a text-based Pandas Series using term_frequency.
+    Represent a text-based Pandas Series using count.
 
     Parameters
     ----------
     s : Pandas Series
     max_features : int, optional
         Maximum number of features to keep.
     return_features_names : Boolean, False by Default
-        If True, return a tuple (*term_frequency_series*, *features_names*)
+        If True, return a tuple (*count_series*, *features_names*)
 
 
     Examples
     --------
     >>> import texthero as hero
     >>> import pandas as pd
     >>> s = pd.Series(["Sentence one", "Sentence two"])
-    >>> hero.term_frequency(s)
+    >>> hero.count(s)
     0    [1, 1, 0]
     1    [1, 0, 1]
     dtype: object
@@ -54,7 +53,7 @@ def term_frequency(
     >>> import texthero as hero
     >>> import pandas as pd
     >>> s = pd.Series(["Sentence one", "Sentence two"])
-    >>> hero.term_frequency(s, return_feature_names=True)
+    >>> hero.count(s, return_feature_names=True)
     (0    [1, 1, 0]
     1    [1, 0, 1]
     dtype: object, ['Sentence', 'one', 'two'])
@@ -72,6 +71,56 @@ def term_frequency(
         return s
 
 
+def term_frequency(
+    s: pd.Series, max_features: Optional[int] = None, return_feature_names=False
+):
+    """
+    Represent a text-based Pandas Series using term frequency.
+
+    Token counts are summed over all documents, so the result is a Series
+    with a single row (index 0) holding one count per vocabulary term.
+
+    Parameters
+    ----------
+    s : Pandas Series
+    max_features : int, optional
+        Maximum number of features to keep.
+    return_feature_names : Boolean, False by Default
+        If True, return a tuple (*term_frequency_series*, *features_names*)
+
+    Examples
+    --------
+    >>> import texthero as hero
+    >>> import pandas as pd
+    >>> s = pd.Series(["Sentence one", "Sentence two"])
+    >>> hero.term_frequency(s)
+    0    [2, 1, 1]
+    dtype: object
+
+    To return the features_names:
+
+    >>> import texthero as hero
+    >>> import pandas as pd
+    >>> s = pd.Series(["Sentence one", "Sentence two"])
+    >>> hero.term_frequency(s, return_feature_names=True)
+    (0    [2, 1, 1]
+    dtype: object, ['Sentence', 'one', 'two'])
+    """
+
+    # Raw string: "\S" in a plain literal is an invalid escape sequence.
+    tf = CountVectorizer(
+        max_features=max_features, lowercase=False, token_pattern=r"\S+"
+    )
+
+    # Sum the per-document counts column-wise into one 1 x n_terms row.
+    series = np.asarray(tf.fit_transform(s).sum(axis=0))
+    s = pd.Series(series.tolist(), index=[0])
+
+    if return_feature_names:
+        return (s, tf.get_feature_names())
+    return s
+
+
 def tfidf(s: pd.Series, max_features=None, min_df=1, return_feature_names=False):
     """
     Represent a text-based Pandas Series using TF-IDF.