Skip to content

Commit

Permalink
Formatted
Browse files Browse the repository at this point in the history
  • Loading branch information
ishanarora04 committed Jul 15, 2020
1 parent 452c93c commit caee34a
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 16 deletions.
42 changes: 33 additions & 9 deletions tests/test_representation.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,28 +20,28 @@ def load_tests(loader, tests, ignore):

class TestRepresentation(PandasTestCase):
"""
Term Frequency.
Count.
"""

def test_term_frequency_single_document(self):
def test_count_single_document(self):
s = pd.Series("a b c c")
s_true = pd.Series([[1, 1, 2]])
self.assertEqual(representation.term_frequency(s), s_true)
self.assertEqual(representation.count(s), s_true)

def test_term_frequency_multiple_documents(self):
def test_count_multiple_documents(self):
s = pd.Series(["doc_one", "doc_two"])
s_true = pd.Series([[1, 0], [0, 1]])
self.assertEqual(representation.term_frequency(s), s_true)
self.assertEqual(representation.count(s), s_true)

def test_term_frequency_not_lowercase(self):
def test_count_not_lowercase(self):
s = pd.Series(["one ONE"])
s_true = pd.Series([[1, 1]])
self.assertEqual(representation.term_frequency(s), s_true)
self.assertEqual(representation.count(s), s_true)

def test_term_frequency_punctuation_are_kept(self):
def test_count_punctuation_are_kept(self):
s = pd.Series(["one !"])
s_true = pd.Series([[1, 1]])
self.assertEqual(representation.term_frequency(s), s_true)
self.assertEqual(representation.count(s), s_true)

"""
TF-IDF
Expand Down Expand Up @@ -100,3 +100,27 @@ def test_most_similar_raise_with_not_in_index(self):
to = "two"
with self.assertRaisesRegex(ValueError, r"index"):
representation.most_similar(s_embed, to)

"""
Term Frequency
"""

def test_term_frequency_single_document(self):
    """A single document yields one row holding the count of each token."""
    documents = pd.Series("a b c c")
    expected = pd.Series([[1, 1, 2]])
    actual = representation.term_frequency(documents)
    self.assertEqual(actual, expected)

def test_term_frequency_multiple_documents(self):
    """Two documents are expected to collapse into a single row of counts."""
    documents = pd.Series(["doc_one", "doc_two"])
    expected = pd.Series([[1, 1]])
    actual = representation.term_frequency(documents)
    self.assertEqual(actual, expected)

def test_term_frequency_not_lowercase(self):
    """"one" and "ONE" are expected to be counted as two distinct tokens."""
    documents = pd.Series(["one ONE"])
    expected = pd.Series([[1, 1]])
    actual = representation.term_frequency(documents)
    self.assertEqual(actual, expected)

def test_term_frequency_punctuation_are_kept(self):
    """A standalone "!" is expected to be counted as a token of its own."""
    documents = pd.Series(["one !"])
    expected = pd.Series([[1, 1]])
    actual = representation.term_frequency(documents)
    self.assertEqual(actual, expected)
63 changes: 56 additions & 7 deletions texthero/representation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.manifold import TSNE
Expand All @@ -24,27 +25,25 @@
"""


def term_frequency(
s: pd.Series, max_features: Optional[int] = None, return_feature_names=False
):
def count(s: pd.Series, max_features: Optional[int] = None, return_feature_names=False):
"""
Represent a text-based Pandas Series using term_frequency.
Represent a text-based Pandas Series using count.
Parameters
----------
s : Pandas Series
max_features : int, optional
Maximum number of features to keep.
return_features_names : Boolean, False by Default
If True, return a tuple (*term_frequency_series*, *features_names*)
If True, return a tuple (*count_series*, *features_names*)
Examples
--------
>>> import texthero as hero
>>> import pandas as pd
>>> s = pd.Series(["Sentence one", "Sentence two"])
>>> hero.term_frequency(s)
>>> hero.count(s)
0 [1, 1, 0]
1 [1, 0, 1]
dtype: object
Expand All @@ -54,7 +53,7 @@ def term_frequency(
>>> import texthero as hero
>>> import pandas as pd
>>> s = pd.Series(["Sentence one", "Sentence two"])
>>> hero.term_frequency(s, return_feature_names=True)
>>> hero.count(s, return_feature_names=True)
(0 [1, 1, 0]
1 [1, 0, 1]
dtype: object, ['Sentence', 'one', 'two'])
Expand All @@ -72,6 +71,56 @@ def term_frequency(
return s


def term_frequency(
    s: pd.Series, max_features: Optional[int] = None, return_feature_names=False
):
    """
    Represent a text-based Pandas Series using term frequency.

    Every run of non-whitespace characters is a token (case and punctuation
    are kept). The per-document counts are summed over all documents in
    *s*, so the result is a Pandas Series with a single row (index 0)
    holding one list of totals, one entry per feature.

    Parameters
    ----------
    s : Pandas Series
    max_features : int, optional
        Maximum number of features to keep.
    return_feature_names : Boolean, False by Default
        If True, return a tuple (*term_frequency_series*, *features_names*)

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Sentence one", "Sentence two"])
    >>> hero.term_frequency(s)
    0    [2, 1, 1]
    dtype: object

    To return the features_names:

    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Sentence one", "Sentence two"])
    >>> hero.term_frequency(s, return_feature_names=True)
    (0    [2, 1, 1]
    dtype: object, ['Sentence', 'one', 'two'])
    """
    # Raw string: a plain "\S+" literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    tf = CountVectorizer(
        max_features=max_features, lowercase=False, token_pattern=r"\S+"
    )

    # Collapse the document-term matrix over the document axis; the result
    # keeps two dimensions (1 x n_features), hence the single-row Series.
    totals = np.asarray(tf.fit_transform(s).sum(axis=0))
    term_frequency_series = pd.Series(totals.tolist(), index=[0])

    if not return_feature_names:
        return term_frequency_series

    # scikit-learn 1.2 removed get_feature_names() in favour of
    # get_feature_names_out(), which returns an ndarray; convert it so the
    # caller still receives a plain list on every version.
    if hasattr(tf, "get_feature_names_out"):
        feature_names = tf.get_feature_names_out().tolist()
    else:
        feature_names = tf.get_feature_names()

    return (term_frequency_series, feature_names)


def tfidf(s: pd.Series, max_features=None, min_df=1, return_feature_names=False):
"""
Represent a text-based Pandas Series using TF-IDF.
Expand Down

0 comments on commit caee34a

Please sign in to comment.