Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HeroTypes in Representation; DataFrame in _types #157

Merged
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
fa342a9
added MultiIndex DF support
mk2510 Aug 18, 2020
59a9f8c
beginning with tests
henrifroese Aug 19, 2020
19c52de
implemented correct sparse support
mk2510 Aug 19, 2020
66e566c
Merge branch 'master_upstream' into change_representation_to_multicolumn
mk2510 Aug 21, 2020
41f55a8
added back list() and rm .tolist()
mk2510 Aug 21, 2020
217611a
rm .tolist() and added list()
mk2510 Aug 21, 2020
6a3b56d
Adopted the test to the new dataframes
mk2510 Aug 21, 2020
b8ff561
wrong format
mk2510 Aug 21, 2020
e3af2f9
Address most review comments.
henrifroese Aug 21, 2020
77ad80e
Add more unittests for representation
henrifroese Aug 21, 2020
f7eb7c3
- Update _types.py with DocumentTermDF
henrifroese Aug 22, 2020
4937a4f
Fix DocumentTermDF example DataFrame column names
henrifroese Aug 22, 2020
e2768b5
implemented the suggested changes
mk2510 Sep 4, 2020
b09f624
fixed messy docstring
mk2510 Sep 4, 2020
508c361
fix black issues
mk2510 Sep 4, 2020
75e955f
fix formatting
mk2510 Sep 4, 2020
9ca244d
begin switch away from multiindex
henrifroese Sep 4, 2020
4ebc266
Merge remote-tracking branch 'origin/change_representation_to_multico…
henrifroese Sep 4, 2020
559a7bd
Finish switch from DocumentTermDF to MatrixDF
henrifroese Sep 4, 2020
75a999c
changed Dataframe name from MatrixDF to DataFrame
mk2510 Sep 4, 2020
a38a32b
merge master
henrifroese Sep 12, 2020
c7b0ece
apply stash
henrifroese Sep 12, 2020
4aeec2a
finish switch to DataFrame type
henrifroese Sep 12, 2020
a304413
Merge remote-tracking branch 'origin/Hero_Types_in_Representation' in…
henrifroese Sep 12, 2020
8d49bff
merge remote
henrifroese Sep 12, 2020
85076b8
Remove check for nlevels
henrifroese Sep 12, 2020
91ed11a
incorporate suggested changes
henrifroese Sep 14, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
env: PATH=/c/Python38:/c/Python38/Scripts:$PATH
install:
- pip3 install --upgrade pip # all three OSes agree about 'pip3'
- pip3 install black
- pip3 install black==19.10b0
- pip3 install ".[dev]" .
# 'python' points to Python 2.7 on macOS but points to Python 3.8 on Linux and Windows
# 'python3' is a 'command not found' error on Windows but 'py' works on Windows only
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ install_requires =
# TODO pick the correct version.
[options.extras_require]
dev =
black>=19.10b0
black==19.10b0
pytest>=4.0.0
Sphinx>=3.0.3
sphinx-markdown-builder>=0.5.4
Expand Down
18 changes: 3 additions & 15 deletions tests/test_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,21 +56,9 @@
]

test_cases_representation = [
[
"count",
lambda x: representation.flatten(representation.count(x)),
(s_tokenized_lists,),
],
[
"term_frequency",
lambda x: representation.flatten(representation.term_frequency(x)),
(s_tokenized_lists,),
],
[
"tfidf",
lambda x: representation.flatten(representation.tfidf(x)),
(s_tokenized_lists,),
],
["count", representation.count, (s_tokenized_lists,),],
["term_frequency", representation.term_frequency, (s_tokenized_lists,),],
["tfidf", representation.tfidf, (s_tokenized_lists,),],
["pca", representation.pca, (s_numeric_lists, 0)],
["nmf", representation.nmf, (s_numeric_lists,)],
["tsne", representation.tsne, (s_numeric_lists,)],
Expand Down
266 changes: 150 additions & 116 deletions tests/test_representation.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,39 +50,114 @@ def _tfidf(term, corpus, document_index):
[["Test", "Test", "TEST", "!"], ["Test", "?", ".", "."]], index=[5, 7]
)

s_tokenized_output_index = pd.MultiIndex.from_tuples(
[(0, "!"), (0, "TEST"), (0, "Test"), (1, "."), (1, "?"), (1, "Test")],
)

s_tokenized_output_noncontinuous_index = pd.MultiIndex.from_tuples(
[(5, "!"), (5, "TEST"), (5, "Test"), (7, "."), (7, "?"), (7, "Test")],
)

s_tokenized_output_min_df_index = pd.MultiIndex.from_tuples([(0, "Test"), (1, "Test")],)
s_tokenized_output_index = pd.Index([0, 1])

s_tokenized_output_index_noncontinous = pd.Index([5, 7])

test_cases_vectorization = [
# format: [function_name, function, correct output for tokenized input above, dtype of output]
["count", representation.count, [1, 1, 2, 2, 1, 1], "int"],
# format: [function_name, function, correct output for tokenized input above]
[
"count",
representation.count,
pd.DataFrame(
[[1, 0, 0, 1, 2], [0, 2, 1, 0, 1]],
index=s_tokenized_output_index,
columns=["!", ".", "?", "TEST", "Test"],
).astype("Sparse[int64, 0]"),
],
[
"term_frequency",
representation.term_frequency,
[0.125, 0.125, 0.250, 0.250, 0.125, 0.125],
"float",
pd.DataFrame(
[[0.125, 0.0, 0.0, 0.125, 0.250], [0.0, 0.25, 0.125, 0.0, 0.125]],
index=s_tokenized_output_index,
columns=["!", ".", "?", "TEST", "Test"],
dtype="Sparse",
).astype("Sparse[float64, nan]"),
],
[
"tfidf",
representation.tfidf,
[_tfidf(x[1], s_tokenized, x[0]) for x in s_tokenized_output_index],
"float",
pd.DataFrame(
[
[
_tfidf(x, s_tokenized, 0) # Testing the tfidf formula here
for x in ["!", ".", "?", "TEST", "Test"]
],
[_tfidf(x, s_tokenized, 1) for x in ["!", ".", "?", "TEST", "Test"]],
],
index=s_tokenized_output_index,
columns=["!", ".", "?", "TEST", "Test"],
).astype("Sparse[float64, nan]"),
],
]


test_cases_vectorization_min_df = [
# format: [function_name, function, correct output for tokenized input above, dtype of output]
["count", representation.count, [2, 1], "int"],
["term_frequency", representation.term_frequency, [0.666667, 0.333333], "float",],
["tfidf", representation.tfidf, [2.0, 1.0], "float",],
# format: [function_name, function, correct output for tokenized input above]
[
"count",
representation.count,
pd.DataFrame([2, 1], index=s_tokenized_output_index, columns=["Test"],).astype(
"Sparse[int64, 0]"
),
],
[
"term_frequency",
representation.term_frequency,
pd.DataFrame(
[0.666667, 0.333333], index=s_tokenized_output_index, columns=["Test"],
).astype("Sparse[float64, nan]"),
],
[
"tfidf",
representation.tfidf,
pd.DataFrame([2, 1], index=s_tokenized_output_index, columns=["Test"],).astype(
"Sparse[float64, nan]"
),
],
]


# Shared inputs for the dimensionality-reduction / clustering tests below:
# the same 2x2 values with the non-continuous index [5, 7], once as a Series
# whose cells are vectors and once as a sparse DataFrame with columns "a", "b".
s_vector_series = pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7])
s_DataFrame = pd.DataFrame(
[[1.0, 0.0], [0.0, 0.0]], index=[5, 7], columns=["a", "b"],
).astype("Sparse[float64, nan]")


# Parameterized cases consumed by both the vector-Series-input and the
# DataFrame-input tests for dimensionality reduction and clustering.
# The tests compare results with check_dtype=False, rtol=0.1, atol=0.1 and
# check_category_order=False, so the expected values below are approximate.
test_cases_dim_reduction_and_clustering = [
# format: [function_name, function, correct output for s_vector_series and s_DataFrame input above]
# pca, nmf and tsne are called by the tests with random_state=42.
["pca", representation.pca, pd.Series([[-0.5, 0.0], [0.5, 0.0]], index=[5, 7],),],
[
"nmf",
representation.nmf,
pd.Series([[5.119042424626627, 0.0], [0.0, 0.0]], index=[5, 7],),
],
[
"tsne",
representation.tsne,
pd.Series([[164.86682, 1814.1647], [-164.8667, -1814.1644]], index=[5, 7],),
],
# kmeans is called by the tests with random_state=42 and n_clusters=2.
[
"kmeans",
representation.kmeans,
pd.Series([1, 0], index=[5, 7], dtype="category"),
],
# dbscan, meanshift and normalize are called by the tests with no extra arguments.
[
"dbscan",
representation.dbscan,
pd.Series([-1, -1], index=[5, 7], dtype="category"),
],
[
"meanshift",
representation.meanshift,
pd.Series([0, 1], index=[5, 7], dtype="category"),
],
# The DataFrame-input test skips this case; normalize with DataFrame input
# is checked in test_normalize_DataFrame_also_as_output instead.
[
"normalize",
representation.normalize,
pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7],),
],
]


Expand All @@ -98,62 +173,25 @@ class AbstractRepresentationTest(PandasTestCase):
"""

@parameterized.expand(test_cases_vectorization)
def test_vectorization_simple(
self, name, test_function, correct_output_values, int_or_float
):
if int_or_float == "int":
s_true = pd.Series(
correct_output_values, index=s_tokenized_output_index, dtype="int"
).astype(pd.SparseDtype(np.int64, 0))
else:
s_true = pd.Series(
correct_output_values, index=s_tokenized_output_index, dtype="float"
).astype(pd.SparseDtype("float", np.nan))
def test_vectorization_simple(self, name, test_function, correct_output):
s_true = correct_output
result_s = test_function(s_tokenized)

pd.testing.assert_series_equal(s_true, result_s)
pd.testing.assert_frame_equal(s_true, result_s, check_dtype=False)

@parameterized.expand(test_cases_vectorization)
def test_vectorization_noncontinuous_index_kept(
self, name, test_function, correct_output_values, int_or_float
self, name, test_function, correct_output=None
):
if int_or_float == "int":
s_true = pd.Series(
correct_output_values,
index=s_tokenized_output_noncontinuous_index,
dtype="int",
).astype(pd.SparseDtype(np.int64, 0))
else:
s_true = pd.Series(
correct_output_values,
index=s_tokenized_output_noncontinuous_index,
dtype="float",
).astype(pd.SparseDtype("float", np.nan))

result_s = test_function(s_tokenized_with_noncontinuous_index)

pd.testing.assert_series_equal(s_true, result_s)
pd.testing.assert_index_equal(
s_tokenized_output_index_noncontinous, result_s.index
)

@parameterized.expand(test_cases_vectorization_min_df)
def test_vectorization_min_df(
self, name, test_function, correct_output_values, int_or_float
):
if int_or_float == "int":
s_true = pd.Series(
correct_output_values,
index=s_tokenized_output_min_df_index,
dtype="int",
).astype(pd.SparseDtype(np.int64, 0))
else:
s_true = pd.Series(
correct_output_values,
index=s_tokenized_output_min_df_index,
dtype="float",
).astype(pd.SparseDtype("float", np.nan))

def test_vectorization_min_df(self, name, test_function, correct_output):
s_true = correct_output
result_s = test_function(s_tokenized, min_df=2)

pd.testing.assert_series_equal(s_true, result_s)
pd.testing.assert_frame_equal(s_true, result_s, check_dtype=False)

@parameterized.expand(test_cases_vectorization)
def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args):
Expand All @@ -168,69 +206,65 @@ def test_vectorization_arguments_to_sklearn(self, name, test_function, *args):
self.fail("Sklearn arguments not handled correctly.")

"""
Individual / special tests.
"""

def test_tfidf_formula(self):
s = pd.Series(["Hi Bye", "Test Bye Bye"])
s = preprocessing.tokenize(s)
s_true_index = pd.MultiIndex.from_tuples(
[(0, "Bye"), (0, "Hi"), (1, "Bye"), (1, "Test")],
)
s_true = pd.Series(
[_tfidf(x[1], s, x[0]) for x in s_true_index], index=s_true_index
).astype("Sparse")

self.assertEqual(representation.tfidf(s), s_true)

"""
flatten.
Dimensionality Reduction and Clustering
"""

def test_flatten(self):
index = pd.MultiIndex.from_tuples(
[("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")],
)
s = pd.Series([3, np.nan, 4], index=index)
@parameterized.expand(test_cases_dim_reduction_and_clustering)
def test_dim_reduction_and_clustering_with_vector_series_input(
self, name, test_function, correct_output
):
s_true = correct_output

s_true = pd.Series(
[[3.0, 0.0, np.nan], [0.0, 4.0, 0.0]], index=["doc0", "doc1"],
)
if name == "kmeans":
result_s = test_function(s_vector_series, random_state=42, n_clusters=2)
elif name == "dbscan" or name == "meanshift" or name == "normalize":
result_s = test_function(s_vector_series)
else:
result_s = test_function(s_vector_series, random_state=42)

pd.testing.assert_series_equal(
representation.flatten(s), s_true, check_names=False
s_true,
result_s,
check_dtype=False,
rtol=0.1,
atol=0.1,
check_category_order=False,
)

def test_flatten_fill_missing_with(self):
index = pd.MultiIndex.from_tuples(
[("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")],
)
s = pd.Series([3, np.nan, 4], index=index)
@parameterized.expand(test_cases_dim_reduction_and_clustering)
def test_dim_reduction_and_clustering_with_DataFrame_input(
henrifroese marked this conversation as resolved.
Show resolved Hide resolved
self, name, test_function, correct_output
):
s_true = correct_output

s_true = pd.Series(
[[3.0, "FILLED", np.nan], ["FILLED", 4.0, "FILLED"]],
index=["doc0", "doc1"],
)
if name == "normalize":
# normalize with DataFrame input is tested separately below, in test_normalize_DataFrame_also_as_output
return

if name == "kmeans":
result_s = test_function(s_DataFrame, random_state=42, n_clusters=2)
elif name == "dbscan" or name == "meanshift" or name == "normalize":
result_s = test_function(s_DataFrame)
else:
result_s = test_function(s_DataFrame, random_state=42)

pd.testing.assert_series_equal(
representation.flatten(s, fill_missing_with="FILLED"),
s_true,
check_names=False,
)

def test_flatten_missing_row(self):
# Simulating a row with no features, so it's completely missing from
# the representation series.
index = pd.MultiIndex.from_tuples(
[("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")],
result_s,
check_dtype=False,
rtol=0.1,
atol=0.1,
check_category_order=False,
)
s = pd.Series([3, np.nan, 4], index=index)

s_true = pd.Series(
[[3.0, 0.0, np.nan], [0.0, 4.0, 0.0], [0.0, 0.0, 0.0]],
index=["doc0", "doc1", "doc2"],
def test_normalize_DataFrame_also_as_output(self):
# normalize should return a DataFrame (not a Series) when given DataFrame
# input, so it is tested separately from the parameterized cases
result = representation.normalize(s_DataFrame)
correct_output = pd.DataFrame(
[[1.0, 0.0], [0.0, 0.0]], index=[5, 7], columns=["a", "b"],
)

pd.testing.assert_series_equal(
representation.flatten(s, index=s_true.index), s_true, check_names=False
pd.testing.assert_frame_equal(
result, correct_output, check_dtype=False, rtol=0.1, atol=0.1,
)
Loading