From fa342a92d4f007cebfce29f1f22a4e31fedc56c6 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Tue, 18 Aug 2020 22:06:14 +0200 Subject: [PATCH 01/23] added MultiIndex DF support suport MultiIndex as function parameter returns MultiIndex, where Representation was returned * missing: correct test Co-authored-by: Henri Froese --- tests/test_indexes.py | 18 +-- tests/test_representation.py | 63 +------- texthero/representation.py | 294 +++++++++++++---------------------- texthero/visualization.py | 4 +- 4 files changed, 115 insertions(+), 264 deletions(-) diff --git a/tests/test_indexes.py b/tests/test_indexes.py index cc041c3a..af7afcd2 100644 --- a/tests/test_indexes.py +++ b/tests/test_indexes.py @@ -56,21 +56,9 @@ ] test_cases_representation = [ - [ - "count", - lambda x: representation.flatten(representation.count(x)), - (s_tokenized_lists,), - ], - [ - "term_frequency", - lambda x: representation.flatten(representation.term_frequency(x)), - (s_tokenized_lists,), - ], - [ - "tfidf", - lambda x: representation.flatten(representation.tfidf(x)), - (s_tokenized_lists,), - ], + ["count", representation.count, (s_tokenized_lists,),], + ["term_frequency", representation.term_frequency, (s_tokenized_lists,),], + ["tfidf", representation.tfidf, (s_tokenized_lists,),], ["pca", representation.pca, (s_numeric_lists, 0)], ["nmf", representation.nmf, (s_numeric_lists,)], ["tsne", representation.tsne, (s_numeric_lists,)], diff --git a/tests/test_representation.py b/tests/test_representation.py index 036775af..41b81ffa 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -50,16 +50,9 @@ def _tfidf(term, corpus, document_index): [["Test", "Test", "TEST", "!"], ["Test", "?", ".", "."]], index=[5, 7] ) -s_tokenized_output_index = pd.MultiIndex.from_tuples( - [(0, "!"), (0, "TEST"), (0, "Test"), (1, "."), (1, "?"), (1, "Test")], -) - -s_tokenized_output_noncontinuous_index = pd.MultiIndex.from_tuples( - [(5, "!"), (5, "TEST"), (5, "Test"), (7, "."), (7, "?"), (7, "Test")], -) - -s_tokenized_output_min_df_index = pd.MultiIndex.from_tuples([(0, "Test"), (1, "Test")],) +s_tokenized_output_index = [0,1] +s_tokenized_output_index_noncontinous = [5,7] test_cases_vectorization = [ # format: [function_name, function, correct output for tokenized input above, dtype of output] @@ -182,55 +175,3 @@ def test_tfidf_formula(self): ).astype("Sparse") self.assertEqual(representation.tfidf(s), s_true) - - """ - flatten. - """ - - def test_flatten(self): - index = pd.MultiIndex.from_tuples( - [("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")], - ) - s = pd.Series([3, np.nan, 4], index=index) - - s_true = pd.Series( - [[3.0, 0.0, np.nan], [0.0, 4.0, 0.0]], index=["doc0", "doc1"], - ) - - pd.testing.assert_series_equal( - representation.flatten(s), s_true, check_names=False - ) - - def test_flatten_fill_missing_with(self): - index = pd.MultiIndex.from_tuples( - [("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")], - ) - s = pd.Series([3, np.nan, 4], index=index) - - s_true = pd.Series( - [[3.0, "FILLED", np.nan], ["FILLED", 4.0, "FILLED"]], - index=["doc0", "doc1"], - ) - - pd.testing.assert_series_equal( - representation.flatten(s, fill_missing_with="FILLED"), - s_true, - check_names=False, - ) - - def test_flatten_missing_row(self): - # Simulating a row with no features, so it's completely missing from - # the representation series. 
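# Editor's note — context for the flatten() tests removed above: flatten() turned a
# two-level (document, term) MultiIndex Series into one list-valued cell per document.
# A minimal, self-contained sketch of that behavior via unstack follows; the toy data
# mirrors the patch's own flatten docstring example and is for illustration only.
import numpy as np
import pandas as pd

index = pd.MultiIndex.from_tuples(
    [("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")]
)
s = pd.Series([3, np.nan, 4], index=index)
flat = s.unstack(fill_value=0.0)  # fills (document, term) pairs that are absent
flat = pd.Series(flat.values.tolist(), index=flat.index)
# flat["doc0"] == [3.0, 0.0, nan]; flat["doc1"] == [0.0, 4.0, 0.0]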
- index = pd.MultiIndex.from_tuples( - [("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")], - ) - s = pd.Series([3, np.nan, 4], index=index) - - s_true = pd.Series( - [[3.0, 0.0, np.nan], [0.0, 4.0, 0.0], [0.0, 0.0, 0.0]], - index=["doc0", "doc1", "doc2"], - ) - - pd.testing.assert_series_equal( - representation.flatten(s, index=s_true.index), s_true, check_names=False - ) diff --git a/texthero/representation.py b/texthero/representation.py index 07b7706c..042db71a 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -27,90 +27,14 @@ """ -def flatten( - s: Union[pd.Series, pd.Series.sparse], - index: pd.Index = None, - fill_missing_with: Any = 0.0, -) -> pd.Series: - """ - Transform a Pandas Representation Series to a "normal" (flattened) Pandas Series. - - The given Series should have a multiindex with first level being the document - and second level being individual features of that document (e.g. tdidf scores per word). - The flattened Series has one cell per document, with the cell being a list of all - the individual features of that document. - - Parameters - ---------- - s : Sparse Pandas Series or Pandas Series - The multiindexed Pandas Series to flatten. - - index : Pandas Index, optional, default to None - The index the flattened Series should have. - - fill_missing_with : Any, default to 0.0 - Value to fill the NaNs (missing values) with. This _does not_ mean - that existing values that are np.nan are replaced, but rather that - features that are not present in one document but present in others - are filled with fill_missing_with. See example below. - - - Examples - -------- - >>> import texthero as hero - >>> import pandas as pd - >>> import numpy as np - >>> index = pd.MultiIndex.from_tuples([("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")], names=['document', 'word']) - >>> s = pd.Series([3, np.nan, 4], index=index) - >>> s - document word - doc0 Word1 3.0 - Word3 NaN - doc1 Word2 4.0 - dtype: float64 - >>> hero.flatten(s, fill_missing_with=0.0) - document - doc0 [3.0, 0.0, nan] - doc1 [0.0, 4.0, 0.0] - dtype: object - - """ - s = s.unstack(fill_value=fill_missing_with) - - if index is not None: - s = s.reindex(index, fill_value=fill_missing_with) - # Reindexing makes the documents for which no values - # are present in the Sparse Representation Series - # "reappear" correctly. - - s = pd.Series(s.values.tolist(), index=s.index) - - return s - - -def _check_is_valid_representation(s: pd.Series) -> bool: +def _check_is_valid_DocumentTermDF(df: Union[pd.DataFrame, pd.Series]) -> bool: """ - Check if the given Pandas Series is a Document Representation Series. + Check if the given Pandas Series is a Document Term DF. - Returns true if Series is Document Representation Series, else False. + Returns true if input is Document Term DF, else False. """ - - # TODO: in Version 2 when only representation is accepted as input -> change "return False" to "raise ValueError" - - if not isinstance(s.index, pd.MultiIndex): - return False - # raise ValueError( - # f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex. The given Pandas Series does not appears to have MultiIndex" - # ) - - if s.index.nlevels != 2: - return False - # raise ValueError( - # f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represent the document and the second one the words/token. The given Pandas Series has {s.index.nlevels} number of levels instead of 2." 
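# Editor's note — a minimal sketch of the "Document Term DF" shape that the helper
# introduced here accepts: a DataFrame whose columns form a two-level
# (function name, term) MultiIndex. The toy frame below is hypothetical.
import pandas as pd

def _is_document_term_df(df) -> bool:
    # Mirrors the new check: DataFrame input whose columns are a MultiIndex.
    return isinstance(df, pd.DataFrame) and isinstance(df.columns, pd.MultiIndex)

columns = pd.MultiIndex.from_tuples([("count", "one"), ("count", "two")])
df = pd.DataFrame([[1, 0], [0, 1]], columns=columns)
assert _is_document_term_df(df)               # MultiIndex columns -> True
assert not _is_document_term_df(df["count"])  # flat columns -> False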
- # ) - - return True + return isinstance(df, pd.DataFrame) and isinstance(df.columns, pd.MultiIndex) # Warning message for not-tokenized inputs @@ -132,11 +56,11 @@ def count( min_df=1, max_df=1.0, binary=False, -) -> pd.Series: +) -> pd.DataFrame: """ Represent a text-based Pandas Series using count. - Return a Document Representation Series with the + Return a Document Term DataFrame with the number of occurences of a document's words for every document. TODO add tutorial link @@ -144,10 +68,6 @@ def count( The input Series should already be tokenized. If not, it will be tokenized before count is calculated. - Use :meth:`hero.representation.flatten` on the output to get - a standard Pandas Series with the document vectors - in every cell. - Parameters ---------- s : Pandas Series (tokenized) @@ -177,15 +97,14 @@ def count( >>> import pandas as pd >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize) >>> hero.count(s) - 0 Sentence 1 - one 1 - 1 Sentence 1 - two 1 - dtype: Sparse[int64, 0] + count + Sentence one two + 0 1 1 0 + 1 1 0 1 See Also -------- - Document Representation Series: TODO add tutorial link + Document Term DataFrame: TODO add tutorial link """ # TODO. Can be rewritten without sklearn. @@ -204,25 +123,23 @@ def count( ) tf_vectors_csr = tf.fit_transform(s) - tf_vectors_coo = coo_matrix(tf_vectors_csr) - s_out = pd.Series.sparse.from_coo(tf_vectors_coo) - - features_names = tf.get_feature_names() - - # Map word index to word name - s_out.index = s_out.index.map(lambda x: (s.index[x[0]], features_names[x[1]])) + multiindexed_columns = pd.MultiIndex.from_tuples( + [("count", word) for word in tf.get_feature_names()] + ) - return s_out + return pd.DataFrame.sparse.from_spmatrix( + tf_vectors_csr, s.index, multiindexed_columns + ) def term_frequency( s: pd.Series, max_features: Optional[int] = None, min_df=1, max_df=1.0, -) -> pd.Series: +) -> pd.DataFrame: """ Represent a text-based Pandas Series using term frequency. - Return a Document Representation Series with the + Return a Document Term DataFrame with the term frequencies of the terms for every document. TODO add tutorial link @@ -230,11 +147,6 @@ def term_frequency( The input Series should already be tokenized. If not, it will be tokenized before term_frequency is calculated. - Use :meth:`hero.representation.flatten` on the output to get - a standard Pandas Series with the document vectors - in every cell. - - Parameters ---------- s : Pandas Series (tokenized) @@ -261,16 +173,14 @@ def term_frequency( >>> import pandas as pd >>> s = pd.Series(["Sentence one hey", "Sentence two"]).pipe(hero.tokenize) >>> hero.term_frequency(s) - 0 Sentence 0.2 - hey 0.2 - one 0.2 - 1 Sentence 0.2 - two 0.2 - dtype: Sparse[float64, nan] + term_frequency + Sentence hey one two + 0 0.2 0.2 0.2 0.0 + 1 0.2 0.0 0.0 0.2 See Also -------- - Document Representation Series: TODO add tutorial link + Document Term DataFrame: TODO add tutorial link """ # Check if input is tokenized. Else, print warning and tokenize. 
if not isinstance(s.iloc[0], list): @@ -291,17 +201,16 @@ def term_frequency( total_count_coo = np.sum(tf_vectors_coo) frequency_coo = np.divide(tf_vectors_coo, total_count_coo) - s_out = pd.Series.sparse.from_coo(frequency_coo) - - features_names = tf.get_feature_names() - - # Map word index to word name - s_out.index = s_out.index.map(lambda x: (s.index[x[0]], features_names[x[1]])) + multiindexed_columns = pd.MultiIndex.from_tuples( + [("term_frequency", word) for word in tf.get_feature_names()] + ) - return s_out + return pd.DataFrame.sparse.from_spmatrix( + frequency_coo, s.index, multiindexed_columns + ) -def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: +def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFrame: """ Represent a text-based Pandas Series using TF-IDF. @@ -324,20 +233,13 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: so the result is exactly what you get applying the formula described above. - Return a Document Representation Series with the + Return a Document Term DataFrame with the tfidf of every word in the document. TODO add tutorial link The input Series should already be tokenized. If not, it will be tokenized before tfidf is calculated. - If working with big pandas Series, you might want to limit - the number of features through the max_features parameter. - - Use :meth:`hero.representation.flatten` on the output to get - a standard Pandas Series with the document vectors - in every cell. - Parameters ---------- s : Pandas Series (tokenized) @@ -365,17 +267,16 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: >>> import pandas as pd >>> s = pd.Series(["Hi Bye", "Test Bye Bye"]).pipe(hero.tokenize) >>> hero.tfidf(s) - 0 Bye 1.000000 - Hi 1.405465 - 1 Bye 2.000000 - Test 1.405465 - dtype: Sparse[float64, nan] + tfidf + Bye Hi Test + 0 1.0 1.405465 0.000000 + 1 2.0 0.000000 1.405465 See Also -------- `TF-IDF on Wikipedia `_ - Document Representation Series: TODO add tutorial link + Document Term DataFrame: TODO add tutorial link """ # Check if input is tokenized. Else, print warning and tokenize. @@ -395,16 +296,13 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: tfidf_vectors_csr = tfidf.fit_transform(s) - # Result from sklearn is in Compressed Sparse Row format. - # Pandas Sparse Series can only be initialized from Coordinate format. - tfidf_vectors_coo = coo_matrix(tfidf_vectors_csr) - s_out = pd.Series.sparse.from_coo(tfidf_vectors_coo) - - # Map word index to word name and keep original index of documents. - feature_names = tfidf.get_feature_names() - s_out.index = s_out.index.map(lambda x: (s.index[x[0]], feature_names[x[1]])) + multiindexed_columns = pd.MultiIndex.from_tuples( + [("tfidf", word) for word in tfidf.get_feature_names()] + ) - return s_out + return pd.DataFrame.sparse.from_spmatrix( + tfidf_vectors_csr, s.index, multiindexed_columns + ) """ @@ -412,7 +310,9 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: """ -def pca(s, n_components=2, random_state=None) -> pd.Series: +def pca( + s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None +) -> pd.Series: """ Perform principal component analysis on the given Pandas Series. @@ -434,7 +334,7 @@ def pca(s, n_components=2, random_state=None) -> pd.Series: Parameters ---------- - s : Pandas Series + s : Pandas Series or MuliIndex Sparse DataFrame n_components : Int. Default is 2. 
Number of components to keep (dimensionality of output vectors). @@ -468,10 +368,18 @@ def pca(s, n_components=2, random_state=None) -> pd.Series: """ pca = PCA(n_components=n_components, random_state=random_state, copy=False) - return pd.Series(pca.fit_transform(list(s)).tolist(), index=s.index) + + if _check_is_valid_DocumentTermDF(s): + values = s.values + else: + values = list(s) + + return pd.Series(pca.fit_transform(values).tolist(), index=s.index) -def nmf(s, n_components=2, random_state=None) -> pd.Series: +def nmf( + s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None +) -> pd.Series: """ Performs non-negative matrix factorization. @@ -491,7 +399,7 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series: Parameters ---------- - s : Pandas Series + s : Pandas Series or Pandas MultiIndex Sparse DataFrame n_components : Int. Default is 2. Number of components to keep (dimensionality of output vectors). @@ -527,11 +435,17 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series: """ nmf = NMF(n_components=n_components, init="random", random_state=random_state,) - return pd.Series(nmf.fit_transform(list(s)).tolist(), index=s.index) + + if _check_is_valid_DocumentTermDF(s): + values = s.sparse.to_coo() + else: + values = list(s) + + return pd.Series(nmf.fit_transform(values).tolist(), index=s.index) def tsne( - s: pd.Series, + s: Union[pd.Series, pd.DataFrame], n_components=2, perplexity=30.0, learning_rate=200.0, @@ -557,7 +471,7 @@ def tsne( Parameters ---------- - s : Pandas Series + s : Pandas Series or Pandas MultiIndex Sparse DataFrame n_components : int, default is 2. Number of components to keep (dimensionality of output vectors). @@ -619,7 +533,13 @@ def tsne( random_state=random_state, n_jobs=n_jobs, ) - return pd.Series(tsne.fit_transform(list(s)).tolist(), index=s.index) + + if _check_is_valid_DocumentTermDF(s): + values = s.sparse.to_coo() + else: + values = list(s) + + return pd.Series(tsne.fit_transform(values).tolist(), index=s.index) """ @@ -628,7 +548,7 @@ def tsne( def kmeans( - s: pd.Series, + s: Union[pd.Series, pd.DataFrame], n_clusters=5, n_init=10, max_iter=300, @@ -653,7 +573,7 @@ def kmeans( Parameters ---------- - s: Pandas Series + s: Pandas Series or Pandas MultiIndex Sparse DataFrame n_clusters: Int, default to 5. The number of clusters to separate the data into. 
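# Editor's aside — the input dispatch this patch threads through nmf/tsne/kmeans/
# dbscan, collected into one standalone sketch (an editor's illustration, not a
# texthero function): DocumentTermDF input reaches the estimator as a scipy sparse
# matrix, while a plain VectorSeries becomes a list of per-document vectors. PCA and
# meanshift are the exceptions — they need dense input, so the patch uses s.values.
def _values_for_estimator(s):
    if _check_is_valid_DocumentTermDF(s):  # helper introduced in this patch
        return s.sparse.to_coo()           # scipy COO matrix; stays sparse
    return list(s)                         # one dense vector per document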
@@ -686,7 +606,7 @@ def kmeans( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, fun, guitar"]) - >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency).pipe(hero.flatten) # TODO: when others get Representation Support: remove flatten + >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency) >>> hero.kmeans(s, n_clusters=2, random_state=42) 0 1 1 0 @@ -702,7 +622,12 @@ def kmeans( `kmeans on Wikipedia `_ """ - vectors = list(s) + + if _check_is_valid_DocumentTermDF(s): + vectors = s.sparse.to_coo() + else: + vectors = list(s) + kmeans = KMeans( n_clusters=n_clusters, n_init=n_init, @@ -715,7 +640,7 @@ def kmeans( def dbscan( - s, + s: Union[pd.Series, pd.DataFrame], eps=0.5, min_samples=5, metric="euclidean", @@ -743,7 +668,7 @@ def dbscan( Parameters ---------- - s: Pandas Series + s: Pandas Series or Pandas MultiIndex Sparse DataFrame eps : float, default=0.5 The maximum distance between two samples for one to be considered @@ -783,7 +708,7 @@ def dbscan( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, enjoy, guitar"]) - >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf).pipe(hero.flatten) # TODO: when others get Representation Support: remove flatten + >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) >>> hero.dbscan(s, min_samples=1, eps=4) 0 0 1 1 @@ -801,6 +726,11 @@ def dbscan( """ + if _check_is_valid_DocumentTermDF(s): + vectors = s.sparse.to_coo() + else: + vectors = list(s) + return pd.Series( DBSCAN( eps=eps, @@ -809,13 +739,13 @@ def dbscan( metric_params=metric_params, leaf_size=leaf_size, n_jobs=n_jobs, - ).fit_predict(list(s)), + ).fit_predict(vectors), index=s.index, ).astype("category") def meanshift( - s, + s: Union[pd.Series, pd.DataFrame], bandwidth=None, bin_seeding=False, min_bin_freq=1, @@ -843,7 +773,7 @@ def meanshift( Parameters ---------- - s: Pandas Series + s: Pandas Series or Pandas MultiIndex Sparse DataFrame bandwidth : float, default=None Bandwidth used in the RBF kernel. @@ -901,6 +831,11 @@ def meanshift( """ + if _check_is_valid_DocumentTermDF(s): + vectors = s.values + else: + vectors = list(s) + return pd.Series( MeanShift( bandwidth=bandwidth, @@ -909,7 +844,7 @@ def meanshift( cluster_all=cluster_all, n_jobs=n_jobs, max_iter=max_iter, - ).fit_predict(list(s)), + ).fit_predict(vectors), index=s.index, ).astype("category") @@ -962,31 +897,18 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: `Norm on Wikipedia `_ """ + isDocumentTermDF = _check_is_valid_DocumentTermDF(s) - is_valid_representation = ( - isinstance(s.index, pd.MultiIndex) and s.index.nlevels == 2 - ) - - if not is_valid_representation: - raise TypeError( - "The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex. The given Pandas Series does not appears to have MultiIndex" - ) - # TODO after merging representation: use _check_is_valid_representation instead - - if pd.api.types.is_sparse(s): - s_coo_matrix = s.sparse.to_coo()[0] + if isDocumentTermDF: + s_for_vectorization = s.sparse.to_coo() else: - s = s.astype("Sparse") - s_coo_matrix = s.sparse.to_coo()[0] - - s_for_vectorization = s_coo_matrix + s_for_vectorization = list(s) result = sklearn_normalize( s_for_vectorization, norm=norm ) # Can handle sparse input. 
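# Editor's aside — what the sklearn_normalize call above relies on, shown on made-up
# data: sklearn's normalize accepts scipy sparse input and returns sparse output, so
# a sparse DocumentTermDF round-trips without densifying.
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize as sklearn_normalize

m = csr_matrix([[1.0, 2.0], [3.0, 4.0]])
result = sklearn_normalize(m, norm="max")  # still scipy sparse
# each row is scaled by its own max: [[0.5, 1.0], [0.75, 1.0]]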
- result_coo = coo_matrix(result) - s_result = pd.Series.sparse.from_coo(result_coo) - s_result.index = s.index - - return s_result + if isDocumentTermDF: + return pd.DataFrame.sparse.from_spmatrix(result, s.index, s.columns) + else: + return pd.Series(result.tolist(), index=s.index) diff --git a/texthero/visualization.py b/texthero/visualization.py index e213285e..2426ab4d 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -63,8 +63,8 @@ def scatterplot( >>> import pandas as pd >>> df = pd.DataFrame(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, fun, guitar"], columns=["texts"]) >>> df["texts"] = hero.clean(df["texts"]).pipe(hero.tokenize) - >>> df["pca"] = hero.tfidf(df["texts"]).pipe(hero.flatten).pipe(hero.pca, n_components=3) # TODO: when others get Representation Support: remove flatten - >>> df["topics"] = hero.tfidf(df["texts"]).pipe(hero.flatten).pipe(hero.kmeans, n_clusters=2) # TODO: when others get Representation Support: remove flatten + >>> df["pca"] = hero.tfidf(df["texts"]).pipe(hero.pca, n_components=3) + >>> df["topics"] = hero.tfidf(df["texts"]).pipe(hero.kmeans, n_clusters=2) >>> hero.scatterplot(df, col="pca", color="topics", hover_data=["texts"]) # doctest: +SKIP """ From 59a9f8c0df70d8136780b3160bc1d2ca59f48b26 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Wed, 19 Aug 2020 19:39:30 +0200 Subject: [PATCH 02/23] beginning with tests --- tests/test_representation.py | 147 +++++++++++++++++------------------ texthero/representation.py | 8 +- 2 files changed, 76 insertions(+), 79 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 41b81ffa..d4acd369 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -50,32 +50,84 @@ def _tfidf(term, corpus, document_index): [["Test", "Test", "TEST", "!"], ["Test", "?", ".", "."]], index=[5, 7] ) -s_tokenized_output_index = [0,1] +s_tokenized_output_index = [0, 1] + +s_tokenized_output_index_noncontinous = [5, 7] + + +def _get_multiindex_for_tokenized_output(first_level_name): + return pd.MultiIndex.from_product( + [[first_level_name], ["!", ".", "?", "TEST", "Test"]] + ) -s_tokenized_output_index_noncontinous = [5,7] test_cases_vectorization = [ - # format: [function_name, function, correct output for tokenized input above, dtype of output] - ["count", representation.count, [1, 1, 2, 2, 1, 1], "int"], + # format: [function_name, function, correct output for tokenized input above] + [ + "count", + representation.count, + pd.DataFrame( + [[1, 0, 0, 1, 2], [0, 2, 1, 0, 1]], + index=s_tokenized_output_index, + columns=_get_multiindex_for_tokenized_output("count"), + ).astype("Sparse"), + ], [ "term_frequency", representation.term_frequency, - [0.125, 0.125, 0.250, 0.250, 0.125, 0.125], - "float", + pd.DataFrame( + [[0.125, 0.0, 0.0, 0.125, 0.250], [0.0, 0.25, 0.125, 0.0, 0.125]], + index=s_tokenized_output_index, + columns=_get_multiindex_for_tokenized_output("term_frequency"), + ).astype("Sparse"), ], [ "tfidf", representation.tfidf, - [_tfidf(x[1], s_tokenized, x[0]) for x in s_tokenized_output_index], - "float", + pd.DataFrame( + [ + [ + _tfidf(x, s_tokenized, 0) # Testing the tfidf formula here + for x in ["!", ".", "?", "TEST", "Test"] + ], + [_tfidf(x, s_tokenized, 0) for x in ["!", ".", "?", "TEST", "Test"]], + ], + index=s_tokenized_output_index, + columns=_get_multiindex_for_tokenized_output("tfidf"), + ).astype("Sparse"), ], ] + test_cases_vectorization_min_df = [ - # format: [function_name, 
function, correct output for tokenized input above, dtype of output] - ["count", representation.count, [2, 1], "int"], - ["term_frequency", representation.term_frequency, [0.666667, 0.333333], "float",], - ["tfidf", representation.tfidf, [2.0, 1.0], "float",], + # format: [function_name, function, correct output for tokenized input above] + [ + "count", + representation.count, + pd.DataFrame( + [2, 1], + index=s_tokenized_output_index, + columns=pd.MultiIndex.from_tuples([("count", "Test")]), + ).astype("Sparse"), + ], + [ + "term_frequency", + representation.term_frequency, + pd.DataFrame( + [0.666667, 0.333333], + index=s_tokenized_output_index, + columns=pd.MultiIndex.from_tuples([("term_frequency", "Test")]), + ).astype("Sparse"), + ], + [ + "tfidf", + representation.tfidf, + pd.DataFrame( + [2.0, 1.0], + index=s_tokenized_output_index, + columns=pd.MultiIndex.from_tuples([("tfidf", "Test")]), + ).astype("Sparse"), + ], ] @@ -91,62 +143,23 @@ class AbstractRepresentationTest(PandasTestCase): """ @parameterized.expand(test_cases_vectorization) - def test_vectorization_simple( - self, name, test_function, correct_output_values, int_or_float - ): - if int_or_float == "int": - s_true = pd.Series( - correct_output_values, index=s_tokenized_output_index, dtype="int" - ).astype(pd.SparseDtype(np.int64, 0)) - else: - s_true = pd.Series( - correct_output_values, index=s_tokenized_output_index, dtype="float" - ).astype(pd.SparseDtype("float", np.nan)) + def test_vectorization_simple(self, name, test_function, correct_output): + s_true = correct_output result_s = test_function(s_tokenized) - - pd.testing.assert_series_equal(s_true, result_s) + pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) @parameterized.expand(test_cases_vectorization) def test_vectorization_noncontinuous_index_kept( - self, name, test_function, correct_output_values, int_or_float + self, name, test_function, correct_output=None ): - if int_or_float == "int": - s_true = pd.Series( - correct_output_values, - index=s_tokenized_output_noncontinuous_index, - dtype="int", - ).astype(pd.SparseDtype(np.int64, 0)) - else: - s_true = pd.Series( - correct_output_values, - index=s_tokenized_output_noncontinuous_index, - dtype="float", - ).astype(pd.SparseDtype("float", np.nan)) - result_s = test_function(s_tokenized_with_noncontinuous_index) - - pd.testing.assert_series_equal(s_true, result_s) + pd.testing.assert_series_equal(s_tokenized_output_index_noncontinous, result_s) @parameterized.expand(test_cases_vectorization_min_df) - def test_vectorization_min_df( - self, name, test_function, correct_output_values, int_or_float - ): - if int_or_float == "int": - s_true = pd.Series( - correct_output_values, - index=s_tokenized_output_min_df_index, - dtype="int", - ).astype(pd.SparseDtype(np.int64, 0)) - else: - s_true = pd.Series( - correct_output_values, - index=s_tokenized_output_min_df_index, - dtype="float", - ).astype(pd.SparseDtype("float", np.nan)) - + def test_vectorization_min_df(self, name, test_function, correct_output): + s_true = correct_output result_s = test_function(s_tokenized, min_df=2) - - pd.testing.assert_series_equal(s_true, result_s) + pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) @parameterized.expand(test_cases_vectorization) def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args): @@ -159,19 +172,3 @@ def test_vectorization_arguments_to_sklearn(self, name, test_function, *args): test_function(s_not_tokenized, max_features=1, min_df=1, 
max_df=1.0) except TypeError: self.fail("Sklearn arguments not handled correctly.") - - """ - Individual / special tests. - """ - - def test_tfidf_formula(self): - s = pd.Series(["Hi Bye", "Test Bye Bye"]) - s = preprocessing.tokenize(s) - s_true_index = pd.MultiIndex.from_tuples( - [(0, "Bye"), (0, "Hi"), (1, "Bye"), (1, "Test")], - ) - s_true = pd.Series( - [_tfidf(x[1], s, x[0]) for x in s_true_index], index=s_true_index - ).astype("Sparse") - - self.assertEqual(representation.tfidf(s), s_true) diff --git a/texthero/representation.py b/texthero/representation.py index 042db71a..efabc9c6 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -97,11 +97,11 @@ def count( >>> import pandas as pd >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize) >>> hero.count(s) - count - Sentence one two + count + Sentence one two 0 1 1 0 1 1 0 1 - +# FIXME columns pandas doctest See Also -------- Document Term DataFrame: TODO add tutorial link @@ -375,7 +375,7 @@ def pca( values = list(s) return pd.Series(pca.fit_transform(values).tolist(), index=s.index) - +# FIXME: merge master again def nmf( s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None From 19c52de3f5ae6a1a01e4262dca00ea5177718311 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Wed, 19 Aug 2020 22:02:41 +0200 Subject: [PATCH 03/23] implemented correct sparse support *missing: test adopting for new types Co-authored-by: Henri Froese --- tests/test_representation.py | 12 ++++---- texthero/representation.py | 59 +++++++++++++++++++++--------------- 2 files changed, 40 insertions(+), 31 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index d4acd369..7c02ccd2 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -70,7 +70,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): [[1, 0, 0, 1, 2], [0, 2, 1, 0, 1]], index=s_tokenized_output_index, columns=_get_multiindex_for_tokenized_output("count"), - ).astype("Sparse"), + ).astype("Sparse[int64, 0]"), ], [ "term_frequency", @@ -108,7 +108,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): [2, 1], index=s_tokenized_output_index, columns=pd.MultiIndex.from_tuples([("count", "Test")]), - ).astype("Sparse"), + ).astype("Sparse[int64, 0]"), ], [ "term_frequency", @@ -123,7 +123,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): "tfidf", representation.tfidf, pd.DataFrame( - [2.0, 1.0], + [2, 1], index=s_tokenized_output_index, columns=pd.MultiIndex.from_tuples([("tfidf", "Test")]), ).astype("Sparse"), @@ -146,20 +146,20 @@ class AbstractRepresentationTest(PandasTestCase): def test_vectorization_simple(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized) - pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) + pd.testing.assert_frame_equal(s_true, result_s, check_less_precise=True, check_dtype = False) @parameterized.expand(test_cases_vectorization) def test_vectorization_noncontinuous_index_kept( self, name, test_function, correct_output=None ): result_s = test_function(s_tokenized_with_noncontinuous_index) - pd.testing.assert_series_equal(s_tokenized_output_index_noncontinous, result_s) + pd.testing.assert_frame_equal(s_tokenized_output_index_noncontinous, result_s.index, check_dtype = False) @parameterized.expand(test_cases_vectorization_min_df) def test_vectorization_min_df(self, name, test_function, correct_output): s_true = correct_output result_s = 
test_function(s_tokenized, min_df=2) - pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) + pd.testing.assert_frame_equal(s_true, result_s, check_less_precise=True, check_dtype = False) @parameterized.expand(test_cases_vectorization) def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args): diff --git a/texthero/representation.py b/texthero/representation.py index efabc9c6..ff691212 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -101,9 +101,12 @@ def count( Sentence one two 0 1 1 0 1 1 0 1 -# FIXME columns pandas doctest + See Also -------- + + # FIXME columns pandas doctest + Document Term DataFrame: TODO add tutorial link """ # TODO. Can be rewritten without sklearn. @@ -375,8 +378,11 @@ def pca( values = list(s) return pd.Series(pca.fit_transform(values).tolist(), index=s.index) + + # FIXME: merge master again + def nmf( s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None ) -> pd.Series: @@ -437,11 +443,12 @@ def nmf( nmf = NMF(n_components=n_components, init="random", random_state=random_state,) if _check_is_valid_DocumentTermDF(s): - values = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: - values = list(s) + s_for_vectorization = list(s) - return pd.Series(nmf.fit_transform(values).tolist(), index=s.index) + return pd.Series(nmf.fit_transform(s_for_vectorization).tolist(), index=s.index) def tsne( @@ -535,11 +542,12 @@ def tsne( ) if _check_is_valid_DocumentTermDF(s): - values = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: - values = list(s) + s_for_vectorization = list(s) - return pd.Series(tsne.fit_transform(values).tolist(), index=s.index) + return pd.Series(tsne.fit_transform(s_for_vectorization).tolist(), index=s.index) """ @@ -624,9 +632,10 @@ def kmeans( """ if _check_is_valid_DocumentTermDF(s): - vectors = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: - vectors = list(s) + s_for_vectorization = list(s) kmeans = KMeans( n_clusters=n_clusters, @@ -635,8 +644,8 @@ def kmeans( random_state=random_state, copy_x=True, algorithm=algorithm, - ).fit(vectors) - return pd.Series(kmeans.predict(vectors), index=s.index).astype("category") + ).fit(s_for_vectorization) + return pd.Series(kmeans.predict(s_for_vectorization), index=s.index).astype("category") def dbscan( @@ -727,9 +736,10 @@ def dbscan( """ if _check_is_valid_DocumentTermDF(s): - vectors = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: - vectors = list(s) + s_for_vectorization = list(s) return pd.Series( DBSCAN( @@ -739,7 +749,7 @@ def dbscan( metric_params=metric_params, leaf_size=leaf_size, n_jobs=n_jobs, - ).fit_predict(vectors), + ).fit_predict(s_for_vectorization), index=s.index, ).astype("category") @@ -877,17 +887,15 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: -------- >>> import texthero as hero >>> import pandas as pd - >>> idx = pd.MultiIndex.from_tuples( - ... [(0, "a"), (0, "b"), (1, "c"), (1, "d")], names=("document", "word") - ... 
) - >>> s = pd.Series([1, 2, 3, 4], index=idx) + >>> col = pd.MultiIndex.from_tuples([(0, "a"), (0, "b"), (1, "c"), (1, "d")]) + >>> s = pd.DataFrame([[1, 2, 3, 4],[4, 2, 7, 5],[2, 2, 3, 5],[1, 2, 9, 8]], columns=col).astype("Sparse") >>> hero.normalize(s, norm="max") - document word - 0 a 0.50 - b 1.00 - 1 c 0.75 - d 1.00 - dtype: Sparse[float64, nan] + 0 1 + a b c d + 0 0.250000 0.500000 0.75 1.000000 + 1 0.571429 0.285714 1.00 0.714286 + 2 0.400000 0.400000 0.60 1.000000 + 3 0.111111 0.222222 1.00 0.888889 See Also @@ -900,7 +908,8 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: isDocumentTermDF = _check_is_valid_DocumentTermDF(s) if isDocumentTermDF: - s_for_vectorization = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: s_for_vectorization = list(s) From 41f55a8a359f15ce4ba65e1e726b9e0757fc596b Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 21 Aug 2020 10:20:02 +0200 Subject: [PATCH 04/23] added back list() and rm .tolist() --- texthero/representation.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/texthero/representation.py b/texthero/representation.py index 048b42ec..025652d9 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -37,7 +37,7 @@ def _check_is_valid_DocumentTermDF(df: Union[pd.DataFrame, pd.Series]) -> bool: return isinstance(df, pd.DataFrame) and isinstance(df.columns, pd.MultiIndex) - s = pd.Series(s.values.tolist(), index=s.index) + s = pd.Series(list(s.values), index=s.index) return s @@ -415,7 +415,7 @@ def pca( else: values = list(s) - return pd.Series(pca.fit_transform(values).tolist(), index=s.index) + return pd.Series(list(pca.fit_transform(values)), index=s.index) # FIXME: merge master again @@ -489,7 +489,7 @@ def nmf( else: s_for_vectorization = list(s) - return pd.Series(nmf.fit_transform(s_for_vectorization).tolist(), index=s.index) + return pd.Series(list(nmf.fit_transform(s_for_vectorization)), index=s.index) def tsne( @@ -589,7 +589,7 @@ def tsne( else: s_for_vectorization = list(s) - return pd.Series(tsne.fit_transform(s_for_vectorization).tolist(), index=s.index) + return pd.Series(list(tsne.fit_transform(s_for_vectorization)), index=s.index) """ @@ -963,4 +963,4 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: if isDocumentTermDF: return pd.DataFrame.sparse.from_spmatrix(result, s.index, s.columns) else: - return pd.Series(result.tolist(), index=s.index) + return pd.Series(list(result), index=s.index) From 217611a2c648db4044d240a9c12a157b94b36bca Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 21 Aug 2020 10:21:41 +0200 Subject: [PATCH 05/23] rm .tolist() and added list() --- texthero/representation.py | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/texthero/representation.py b/texthero/representation.py index 025652d9..fdab73dd 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -37,36 +37,6 @@ def _check_is_valid_DocumentTermDF(df: Union[pd.DataFrame, pd.Series]) -> bool: return isinstance(df, pd.DataFrame) and isinstance(df.columns, pd.MultiIndex) - s = pd.Series(list(s.values), index=s.index) - - return s - - -def _check_is_valid_representation(s: pd.Series) -> bool: - """ - Check if the given Pandas Series is a Document Representation Series. - - Returns true if Series is Document Representation Series, else False. 
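# Editor's aside — the list()-vs-.tolist() change made in the two commits above,
# sketched on a toy array (the reading of intent is the editor's): .tolist() turns
# every row into a nested Python list, while list() keeps one numpy row-vector per
# cell and skips the elementwise conversion.
import numpy as np
import pandas as pd

arr = np.array([[1.0, 2.0], [3.0, 4.0]])
pd.Series(arr.tolist())  # cells are plain lists, e.g. [1.0, 2.0]
pd.Series(list(arr))     # cells are numpy rows, e.g. array([1., 2.])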
- - """ - - # TODO: in Version 2 when only representation is accepted as input -> change "return False" to "raise ValueError" - - if not isinstance(s.index, pd.MultiIndex): - return False - # raise ValueError( - # f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex. The given Pandas Series does not appears to have MultiIndex" - # ) - - if s.index.nlevels != 2: - return False - # raise ValueError( - # f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represent the document and the second one the words/token. The given Pandas Series has {s.index.nlevels} number of levels instead of 2." - # ) - - return True - - # Warning message for not-tokenized inputs _not_tokenized_warning_message = ( "It seems like the given Pandas Series s is not tokenized. This function will" @@ -963,4 +933,4 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: if isDocumentTermDF: return pd.DataFrame.sparse.from_spmatrix(result, s.index, s.columns) else: - return pd.Series(list(result), index=s.index) + return pd.Series((result), index=s.index) From 6a3b56d1a56401880efa7cfa7dd32668e23b25ea Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 21 Aug 2020 10:41:22 +0200 Subject: [PATCH 06/23] Adopted the test to the new dataframes --- tests/test_representation.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 7c02ccd2..3564730e 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -90,7 +90,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): _tfidf(x, s_tokenized, 0) # Testing the tfidf formula here for x in ["!", ".", "?", "TEST", "Test"] ], - [_tfidf(x, s_tokenized, 0) for x in ["!", ".", "?", "TEST", "Test"]], + [_tfidf(x, s_tokenized, 1) for x in ["!", ".", "?", "TEST", "Test"]], ], index=s_tokenized_output_index, columns=_get_multiindex_for_tokenized_output("tfidf"), @@ -146,20 +146,28 @@ class AbstractRepresentationTest(PandasTestCase): def test_vectorization_simple(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized) - pd.testing.assert_frame_equal(s_true, result_s, check_less_precise=True, check_dtype = False) + pd.testing.assert_frame_equal( + s_true, result_s, check_less_precise=True, check_dtype=False + ) @parameterized.expand(test_cases_vectorization) def test_vectorization_noncontinuous_index_kept( self, name, test_function, correct_output=None ): result_s = test_function(s_tokenized_with_noncontinuous_index) - pd.testing.assert_frame_equal(s_tokenized_output_index_noncontinous, result_s.index, check_dtype = False) + pd.testing.assert_series_equal( + pd.Series(s_tokenized_output_index_noncontinous), + pd.Series(result_s.index), + check_dtype=False, + ) @parameterized.expand(test_cases_vectorization_min_df) def test_vectorization_min_df(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized, min_df=2) - pd.testing.assert_frame_equal(s_true, result_s, check_less_precise=True, check_dtype = False) + pd.testing.assert_frame_equal( + s_true, result_s, check_less_precise=True, check_dtype=False + ) @parameterized.expand(test_cases_vectorization) def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args): From b8ff5611e550f5f4bc023b2b76ef8ebcff7f8021 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 21 Aug 2020 10:41:35 
+0200 Subject: [PATCH 07/23] wrong format --- texthero/representation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/texthero/representation.py b/texthero/representation.py index fdab73dd..ac0a458f 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -657,7 +657,9 @@ def kmeans( copy_x=True, algorithm=algorithm, ).fit(s_for_vectorization) - return pd.Series(kmeans.predict(s_for_vectorization), index=s.index).astype("category") + return pd.Series(kmeans.predict(s_for_vectorization), index=s.index).astype( + "category" + ) def dbscan( From e3af2f9da094505861cddc420f57490700ca88ef Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Fri, 21 Aug 2020 18:48:51 +0200 Subject: [PATCH 08/23] Address most review comments. --- tests/test_representation.py | 19 ++++++++-------- texthero/representation.py | 42 +++++++++++++++++++++++++----------- 2 files changed, 38 insertions(+), 23 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 3564730e..5f985996 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -50,9 +50,9 @@ def _tfidf(term, corpus, document_index): [["Test", "Test", "TEST", "!"], ["Test", "?", ".", "."]], index=[5, 7] ) -s_tokenized_output_index = [0, 1] +s_tokenized_output_index = pd.Index([0, 1]) -s_tokenized_output_index_noncontinous = [5, 7] +s_tokenized_output_index_noncontinous = pd.Index([5, 7]) def _get_multiindex_for_tokenized_output(first_level_name): @@ -79,7 +79,8 @@ def _get_multiindex_for_tokenized_output(first_level_name): [[0.125, 0.0, 0.0, 0.125, 0.250], [0.0, 0.25, 0.125, 0.0, 0.125]], index=s_tokenized_output_index, columns=_get_multiindex_for_tokenized_output("term_frequency"), - ).astype("Sparse"), + dtype="Sparse", + ).astype("Sparse[float64, nan]"), ], [ "tfidf", @@ -94,7 +95,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): ], index=s_tokenized_output_index, columns=_get_multiindex_for_tokenized_output("tfidf"), - ).astype("Sparse"), + ).astype("Sparse[float64, nan]"), ], ] @@ -117,7 +118,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): [0.666667, 0.333333], index=s_tokenized_output_index, columns=pd.MultiIndex.from_tuples([("term_frequency", "Test")]), - ).astype("Sparse"), + ).astype("Sparse[float64, nan]"), ], [ "tfidf", @@ -126,7 +127,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): [2, 1], index=s_tokenized_output_index, columns=pd.MultiIndex.from_tuples([("tfidf", "Test")]), - ).astype("Sparse"), + ).astype("Sparse[float64, nan]"), ], ] @@ -155,10 +156,8 @@ def test_vectorization_noncontinuous_index_kept( self, name, test_function, correct_output=None ): result_s = test_function(s_tokenized_with_noncontinuous_index) - pd.testing.assert_series_equal( - pd.Series(s_tokenized_output_index_noncontinous), - pd.Series(result_s.index), - check_dtype=False, + pd.testing.assert_index_equal( + s_tokenized_output_index_noncontinous, result_s.index ) @parameterized.expand(test_cases_vectorization_min_df) diff --git a/texthero/representation.py b/texthero/representation.py index ac0a458f..7793cb2b 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -145,7 +145,7 @@ def term_frequency( Return a Document Term DataFrame with the term frequencies of the terms for every - document. + document. The output is sparse. TODO add tutorial link The input Series should already be tokenized. 
If not, it will @@ -241,7 +241,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram formula described above. Return a Document Term DataFrame with the - tfidf of every word in the document. + tfidf of every word in the document. The output is sparse. TODO add tutorial link The input Series should already be tokenized. If not, it will @@ -341,9 +341,13 @@ def pca( In general, *pca* should be called after the text has already been represented to a matrix form. + PCA cannot directly handle sparse input, so when calling pca on a + DocumentTermDF, the input has to be expanded which can lead to + memory problems with big datasets. + Parameters ---------- - s : Pandas Series or MuliIndex Sparse DataFrame + s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_components : Int. Default is 2. Number of components to keep (dimensionality of output vectors). @@ -388,9 +392,6 @@ def pca( return pd.Series(list(pca.fit_transform(values)), index=s.index) -# FIXME: merge master again - - def nmf( s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None ) -> pd.Series: @@ -410,10 +411,12 @@ def nmf( n_components many topics (clusters) and calculate a vector for each document that places it correctly among the topics. + NMF can directly handle sparse input, so when calling nmf on a + DocumentTermDF, the advantage of sparseness is kept. Parameters ---------- - s : Pandas Series or Pandas MultiIndex Sparse DataFrame + s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_components : Int. Default is 2. Number of components to keep (dimensionality of output vectors). @@ -484,10 +487,12 @@ def tsne( document gets a new, low-dimensional (n_components entries) vector in such a way that the differences / similarities between documents are preserved. + T-SNE can directly handle sparse input, so when calling tsne on a + DocumentTermDF, the advantage of sparseness is kept. Parameters ---------- - s : Pandas Series or Pandas MultiIndex Sparse DataFrame + s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_components : int, default is 2. Number of components to keep (dimensionality of output vectors). @@ -591,9 +596,12 @@ def kmeans( function that assigns a scalar (a weight) to each word), K-means will find k topics (clusters) and assign a topic to each document. + Kmeans can directly handle sparse input, so when calling kmeans on a + DocumentTermDF, the advantage of sparseness is kept. + Parameters ---------- - s: Pandas Series or Pandas MultiIndex Sparse DataFrame + s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_clusters: Int, default to 5. The number of clusters to separate the data into. @@ -689,9 +697,12 @@ def dbscan( function that assigns a scalar (a weight) to each word), DBSCAN will find topics (clusters) and assign a topic to each document. + DBSCAN can directly handle sparse input, so when calling dbscan on a + DocumentTermDF, the advantage of sparseness is kept. + Parameters ---------- - s: Pandas Series or Pandas MultiIndex Sparse DataFrame + s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) eps : float, default=0.5 The maximum distance between two samples for one to be considered @@ -795,9 +806,13 @@ def meanshift( function that assigns a scalar (a weight) to each word), mean shift will find topics (clusters) and assign a topic to each document. 
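# Editor's aside — the end-to-end flow these docstring updates describe, as a hedged
# usage sketch (toy sentences; function names as defined in this patch series): the
# document-term output of tfidf() now feeds the clusterers directly, with no
# flatten() step in between.
import pandas as pd
import texthero as hero

s = pd.Series(["football sports soccer", "music violin orchestra"])
s = s.pipe(hero.clean).pipe(hero.tokenize)
dtm = hero.tfidf(s)                                       # sparse DocumentTermDF
labels = hero.kmeans(dtm, n_clusters=2, random_state=42)  # categorical Series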
+ Menashift cannot directly handle sparse input, so when calling meanshift on a + DocumentTermDF, the input has to be expanded which can lead to + memory problems with big datasets. + Parameters ---------- - s: Pandas Series or Pandas MultiIndex Sparse DataFrame + s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) bandwidth : float, default=None Bandwidth used in the RBF kernel. @@ -889,11 +904,12 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: """ Normalize every cell in a Pandas Series. - Input has to be a Representation Series. + Input can be VectorSeries or DocumentTermDF. For DocumentTermDFs, + the sparseness is kept. Parameters ---------- - s: Pandas Series + s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) norm: str, default to "l2" One of "l1", "l2", or "max". The norm that is used. From 77ad80ecf8977a098b73c4f12c8f28951c769dfc Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Fri, 21 Aug 2020 19:45:48 +0200 Subject: [PATCH 09/23] Add more unittests for representation --- tests/test_representation.py | 118 +++++++++++++++++++++++++++++++++-- texthero/representation.py | 14 ++--- 2 files changed, 118 insertions(+), 14 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 5f985996..2722289e 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -132,6 +132,50 @@ def _get_multiindex_for_tokenized_output(first_level_name): ] +s_vector_series = pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7]) +s_documenttermDF = pd.DataFrame( + [[1.0, 0.0], [0.0, 0.0]], + index=[5, 7], + columns=pd.MultiIndex.from_product([["test"], ["a", "b"]]), +).astype("Sparse[float64, nan]") + + +test_cases_dim_reduction_and_clustering = [ + # format: [function_name, function, correct output for s_vector_series and s_documenttermDF input above] + ["pca", representation.pca, pd.Series([[-0.5, 0.0], [0.5, 0.0]], index=[5, 7],),], + [ + "nmf", + representation.nmf, + pd.Series([[5.119042424626627, 0.0], [0.0, 0.0]], index=[5, 7],), + ], + [ + "tsne", + representation.tsne, + pd.Series([[164.86682, 1814.1647], [-164.8667, -1814.1644]], index=[5, 7],), + ], + [ + "kmeans", + representation.kmeans, + pd.Series([1, 0], index=[5, 7], dtype="category"), + ], + [ + "dbscan", + representation.dbscan, + pd.Series([-1, -1], index=[5, 7], dtype="category"), + ], + [ + "meanshift", + representation.meanshift, + pd.Series([0, 1], index=[5, 7], dtype="category"), + ], + [ + "normalize", + representation.normalize, + pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7],), + ], +] + + class AbstractRepresentationTest(PandasTestCase): """ Class for representation test cases. 
Most tests are @@ -147,9 +191,7 @@ class AbstractRepresentationTest(PandasTestCase): def test_vectorization_simple(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized) - pd.testing.assert_frame_equal( - s_true, result_s, check_less_precise=True, check_dtype=False - ) + pd.testing.assert_frame_equal(s_true, result_s, check_dtype=False) @parameterized.expand(test_cases_vectorization) def test_vectorization_noncontinuous_index_kept( @@ -164,9 +206,7 @@ def test_vectorization_noncontinuous_index_kept( def test_vectorization_min_df(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized, min_df=2) - pd.testing.assert_frame_equal( - s_true, result_s, check_less_precise=True, check_dtype=False - ) + pd.testing.assert_frame_equal(s_true, result_s, check_dtype=False) @parameterized.expand(test_cases_vectorization) def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args): @@ -179,3 +219,69 @@ def test_vectorization_arguments_to_sklearn(self, name, test_function, *args): test_function(s_not_tokenized, max_features=1, min_df=1, max_df=1.0) except TypeError: self.fail("Sklearn arguments not handled correctly.") + + """ + Dimensionality Reduction and Clustering + """ + + @parameterized.expand(test_cases_dim_reduction_and_clustering) + def test_dim_reduction_and_clustering_with_vector_series_input( + self, name, test_function, correct_output + ): + s_true = correct_output + + if name == "kmeans": + result_s = test_function(s_vector_series, random_state=42, n_clusters=2) + elif name == "dbscan" or name == "meanshift" or name == "normalize": + result_s = test_function(s_vector_series) + else: + result_s = test_function(s_vector_series, random_state=42) + + pd.testing.assert_series_equal( + s_true, + result_s, + check_dtype=False, + rtol=0.1, + atol=0.1, + check_category_order=False, + ) + + @parameterized.expand(test_cases_dim_reduction_and_clustering) + def test_dim_reduction_and_clustering_with_documenttermDF_input( + self, name, test_function, correct_output + ): + s_true = correct_output + + if name == "normalize": + # testing this below separately + return + + if name == "kmeans": + result_s = test_function(s_documenttermDF, random_state=42, n_clusters=2) + elif name == "dbscan" or name == "meanshift" or name == "normalize": + result_s = test_function(s_documenttermDF) + else: + result_s = test_function(s_documenttermDF, random_state=42) + + pd.testing.assert_series_equal( + s_true, + result_s, + check_dtype=False, + rtol=0.1, + atol=0.1, + check_category_order=False, + ) + + def test_normalize_documenttermDF_also_as_output(self): + # normalize should also return DocumentTermDF output for DocumentTermDF + # input so we test it separately + result = representation.normalize(s_documenttermDF) + correct_output = pd.DataFrame( + [[1.0, 0.0], [0.0, 0.0]], + index=[5, 7], + columns=pd.MultiIndex.from_product([["test"], ["a", "b"]]), + ) + + pd.testing.assert_frame_equal( + result, correct_output, check_dtype=False, rtol=0.1, atol=0.1, + ) diff --git a/texthero/representation.py b/texthero/representation.py index 7793cb2b..8e876088 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -97,7 +97,7 @@ def count( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize) - >>> hero.count(s) + >>> hero.count(s) # doctest: +SKIP count Sentence one two 0 1 1 0 @@ -106,8 +106,6 @@ def count( See Also 
-------- - # FIXME columns pandas doctest - Document Term DataFrame: TODO add tutorial link """ # TODO. Can be rewritten without sklearn. @@ -177,7 +175,7 @@ def term_frequency( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Sentence one hey", "Sentence two"]).pipe(hero.tokenize) - >>> hero.term_frequency(s) + >>> hero.term_frequency(s) # doctest: +SKIP term_frequency Sentence hey one two 0 0.2 0.2 0.2 0.0 @@ -273,7 +271,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Hi Bye", "Test Bye Bye"]).pipe(hero.tokenize) - >>> hero.tfidf(s) + >>> hero.tfidf(s) # doctest: +SKIP tfidf Bye Hi Test 0 1.0 1.405465 0.000000 @@ -900,7 +898,7 @@ def meanshift( """ -def normalize(s: pd.Series, norm="l2") -> pd.Series: +def normalize(s: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Series: """ Normalize every cell in a Pandas Series. @@ -920,7 +918,7 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: >>> import pandas as pd >>> col = pd.MultiIndex.from_tuples([(0, "a"), (0, "b"), (1, "c"), (1, "d")]) >>> s = pd.DataFrame([[1, 2, 3, 4],[4, 2, 7, 5],[2, 2, 3, 5],[1, 2, 9, 8]], columns=col).astype("Sparse") - >>> hero.normalize(s, norm="max") + >>> hero.normalize(s, norm="max") # doctest: +SKIP 0 1 a b c d 0 0.250000 0.500000 0.75 1.000000 @@ -951,4 +949,4 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: if isDocumentTermDF: return pd.DataFrame.sparse.from_spmatrix(result, s.index, s.columns) else: - return pd.Series((result), index=s.index) + return pd.Series(list(result), index=s.index) From e2768b543b76b5d4d496f82061c70e5481317d61 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 4 Sep 2020 17:04:45 +0200 Subject: [PATCH 10/23] implemented the suggested changes --- setup.cfg | 2 +- tests/test_representation.py | 23 ++++++++--------------- texthero/representation.py | 29 +++++++---------------------- 3 files changed, 16 insertions(+), 38 deletions(-) diff --git a/setup.cfg b/setup.cfg index d6103b02..cc082845 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,7 +41,7 @@ install_requires = # TODO pick the correct version. 
[options.extras_require] dev = - black>=19.10b0 + black=19.10b0 pytest>=4.0.0 Sphinx>=3.0.3 sphinx-markdown-builder>=0.5.4 diff --git a/tests/test_representation.py b/tests/test_representation.py index 2722289e..e1db52e1 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -54,13 +54,6 @@ def _tfidf(term, corpus, document_index): s_tokenized_output_index_noncontinous = pd.Index([5, 7]) - -def _get_multiindex_for_tokenized_output(first_level_name): - return pd.MultiIndex.from_product( - [[first_level_name], ["!", ".", "?", "TEST", "Test"]] - ) - - test_cases_vectorization = [ # format: [function_name, function, correct output for tokenized input above] [ @@ -69,7 +62,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): pd.DataFrame( [[1, 0, 0, 1, 2], [0, 2, 1, 0, 1]], index=s_tokenized_output_index, - columns=_get_multiindex_for_tokenized_output("count"), + columns=["!", ".", "?", "TEST", "Test"], ).astype("Sparse[int64, 0]"), ], [ @@ -78,7 +71,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): pd.DataFrame( [[0.125, 0.0, 0.0, 0.125, 0.250], [0.0, 0.25, 0.125, 0.0, 0.125]], index=s_tokenized_output_index, - columns=_get_multiindex_for_tokenized_output("term_frequency"), + columns=["!", ".", "?", "TEST", "Test"], dtype="Sparse", ).astype("Sparse[float64, nan]"), ], @@ -94,7 +87,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): [_tfidf(x, s_tokenized, 1) for x in ["!", ".", "?", "TEST", "Test"]], ], index=s_tokenized_output_index, - columns=_get_multiindex_for_tokenized_output("tfidf"), + columns=["!", ".", "?", "TEST", "Test"], ).astype("Sparse[float64, nan]"), ], ] @@ -108,7 +101,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): pd.DataFrame( [2, 1], index=s_tokenized_output_index, - columns=pd.MultiIndex.from_tuples([("count", "Test")]), + columns=["Test"], ).astype("Sparse[int64, 0]"), ], [ @@ -117,7 +110,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): pd.DataFrame( [0.666667, 0.333333], index=s_tokenized_output_index, - columns=pd.MultiIndex.from_tuples([("term_frequency", "Test")]), + columns=[ "Test"], ).astype("Sparse[float64, nan]"), ], [ @@ -126,7 +119,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): pd.DataFrame( [2, 1], index=s_tokenized_output_index, - columns=pd.MultiIndex.from_tuples([("tfidf", "Test")]), + columns= ["Test"], ).astype("Sparse[float64, nan]"), ], ] @@ -136,7 +129,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): s_documenttermDF = pd.DataFrame( [[1.0, 0.0], [0.0, 0.0]], index=[5, 7], - columns=pd.MultiIndex.from_product([["test"], ["a", "b"]]), + columns=["a", "b"], ).astype("Sparse[float64, nan]") @@ -279,7 +272,7 @@ def test_normalize_documenttermDF_also_as_output(self): correct_output = pd.DataFrame( [[1.0, 0.0], [0.0, 0.0]], index=[5, 7], - columns=pd.MultiIndex.from_product([["test"], ["a", "b"]]), + columns= ["a", "b"], ) pd.testing.assert_frame_equal( diff --git a/texthero/representation.py b/texthero/representation.py index 8e876088..ce8aae9c 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -34,7 +34,7 @@ def _check_is_valid_DocumentTermDF(df: Union[pd.DataFrame, pd.Series]) -> bool: Returns true if input is Document Term DF, else False. 
""" - return isinstance(df, pd.DataFrame) and isinstance(df.columns, pd.MultiIndex) + return isinstance(df, pd.DataFrame) and not isinstance(df.columns, pd.MultiIndex) # Warning message for not-tokenized inputs @@ -97,8 +97,7 @@ def count( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize) - >>> hero.count(s) # doctest: +SKIP - count + >>> hero.count(s) # doctest: +SKIP Sentence one two 0 1 1 0 1 1 0 1 @@ -126,12 +125,8 @@ def count( tf_vectors_csr = tf.fit_transform(s) - multiindexed_columns = pd.MultiIndex.from_tuples( - [("count", word) for word in tf.get_feature_names()] - ) - return pd.DataFrame.sparse.from_spmatrix( - tf_vectors_csr, s.index, multiindexed_columns + tf_vectors_csr, s.index, tf.get_feature_names() ) @@ -175,8 +170,7 @@ def term_frequency( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Sentence one hey", "Sentence two"]).pipe(hero.tokenize) - >>> hero.term_frequency(s) # doctest: +SKIP - term_frequency + >>> hero.term_frequency(s) # doctest: +SKIP Sentence hey one two 0 0.2 0.2 0.2 0.0 1 0.2 0.0 0.0 0.2 @@ -204,12 +198,8 @@ def term_frequency( total_count_coo = np.sum(tf_vectors_coo) frequency_coo = np.divide(tf_vectors_coo, total_count_coo) - multiindexed_columns = pd.MultiIndex.from_tuples( - [("term_frequency", word) for word in tf.get_feature_names()] - ) - return pd.DataFrame.sparse.from_spmatrix( - frequency_coo, s.index, multiindexed_columns + frequency_coo, s.index, tf.get_feature_names() ) @@ -271,8 +261,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Hi Bye", "Test Bye Bye"]).pipe(hero.tokenize) - >>> hero.tfidf(s) # doctest: +SKIP - tfidf + >>> hero.tfidf(s) # doctest: +SKIP Bye Hi Test 0 1.0 1.405465 0.000000 1 2.0 0.000000 1.405465 @@ -301,12 +290,8 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram tfidf_vectors_csr = tfidf.fit_transform(s) - multiindexed_columns = pd.MultiIndex.from_tuples( - [("tfidf", word) for word in tfidf.get_feature_names()] - ) - return pd.DataFrame.sparse.from_spmatrix( - tfidf_vectors_csr, s.index, multiindexed_columns + tfidf_vectors_csr, s.index, tfidf.get_feature_names() ) From b09f6242c8b97865aaa5a23a9c52e2b1b1ffa4c2 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 4 Sep 2020 17:08:26 +0200 Subject: [PATCH 11/23] fixed messy docstring --- texthero/representation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/texthero/representation.py b/texthero/representation.py index ce8aae9c..f7e2a01a 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -134,11 +134,11 @@ def term_frequency( s: pd.Series, max_features: Optional[int] = None, min_df=1, max_df=1.0, ) -> pd.DataFrame: """ - Represent a text-based Pandas Series using term frequency. + Return a count document-term DataFrame based on the given Pandas Series - Return a Document Term DataFrame with the - term frequencies of the terms for every - document. The output is sparse. + Rows of the returned DataFrame represent document whereas columns are terms. + The value in the cell document-term is the frequency of the term in + this document. The output is sparse. TODO add tutorial link The input Series should already be tokenized. 
If not, it will From 508c3617988302a898f6c623eba6b02e910b2aee Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 4 Sep 2020 17:18:49 +0200 Subject: [PATCH 12/23] fix black issues --- .travis.yml | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index f913f183..c76284b3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,7 +20,7 @@ jobs: env: PATH=/c/Python38:/c/Python38/Scripts:$PATH install: - pip3 install --upgrade pip # all three OSes agree about 'pip3' - - pip3 install black + - pip3 install black==19.10b0 - pip3 install ".[dev]" . # 'python' points to Python 2.7 on macOS but points to Python 3.8 on Linux and Windows # 'python3' is a 'command not found' error on Windows but 'py' works on Windows only diff --git a/setup.cfg b/setup.cfg index cc082845..3f86e7f3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,7 +41,7 @@ install_requires = # TODO pick the correct version. [options.extras_require] dev = - black=19.10b0 + black==19.10b0 pytest>=4.0.0 Sphinx>=3.0.3 sphinx-markdown-builder>=0.5.4 From 75e955fbce511e353d3381a1cb55d1e826bf8fef Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 4 Sep 2020 17:19:25 +0200 Subject: [PATCH 13/23] fix formatting --- tests/test_representation.py | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index e1db52e1..ff821efb 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -98,38 +98,30 @@ def _tfidf(term, corpus, document_index): [ "count", representation.count, - pd.DataFrame( - [2, 1], - index=s_tokenized_output_index, - columns=["Test"], - ).astype("Sparse[int64, 0]"), + pd.DataFrame([2, 1], index=s_tokenized_output_index, columns=["Test"],).astype( + "Sparse[int64, 0]" + ), ], [ "term_frequency", representation.term_frequency, pd.DataFrame( - [0.666667, 0.333333], - index=s_tokenized_output_index, - columns=[ "Test"], + [0.666667, 0.333333], index=s_tokenized_output_index, columns=["Test"], ).astype("Sparse[float64, nan]"), ], [ "tfidf", representation.tfidf, - pd.DataFrame( - [2, 1], - index=s_tokenized_output_index, - columns= ["Test"], - ).astype("Sparse[float64, nan]"), + pd.DataFrame([2, 1], index=s_tokenized_output_index, columns=["Test"],).astype( + "Sparse[float64, nan]" + ), ], ] s_vector_series = pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7]) s_documenttermDF = pd.DataFrame( - [[1.0, 0.0], [0.0, 0.0]], - index=[5, 7], - columns=["a", "b"], + [[1.0, 0.0], [0.0, 0.0]], index=[5, 7], columns=["a", "b"], ).astype("Sparse[float64, nan]") @@ -270,9 +262,7 @@ def test_normalize_documenttermDF_also_as_output(self): # input so we test it separately result = representation.normalize(s_documenttermDF) correct_output = pd.DataFrame( - [[1.0, 0.0], [0.0, 0.0]], - index=[5, 7], - columns= ["a", "b"], + [[1.0, 0.0], [0.0, 0.0]], index=[5, 7], columns=["a", "b"], ) pd.testing.assert_frame_equal( From 7fec40e1d4e943f6ac00ff22b2ddfb2b7cf104ca Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Mon, 7 Sep 2020 20:09:06 +0200 Subject: [PATCH 14/23] changed s to input_matrix --- texthero/representation.py | 100 ++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/texthero/representation.py b/texthero/representation.py index f7e2a01a..eb0ee506 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -301,7 +301,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> 
pd.DataFram def pca( - s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None + input_matrix: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None ) -> pd.Series: """ Perform principal component analysis on the given Pandas Series. @@ -330,7 +330,7 @@ def pca( Parameters ---------- - s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) + input_matrix : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_components : Int. Default is 2. Number of components to keep (dimensionality of output vectors). @@ -367,16 +367,16 @@ def pca( """ pca = PCA(n_components=n_components, random_state=random_state, copy=False) - if _check_is_valid_DocumentTermDF(s): - values = s.values + if _check_is_valid_DocumentTermDF(input_matrix): + values = input_matrix.values else: - values = list(s) + values = list(input_matrix) - return pd.Series(list(pca.fit_transform(values)), index=s.index) + return pd.Series(list(pca.fit_transform(values)), index=input_matrix.index) def nmf( - s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None + input_matrix: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None ) -> pd.Series: """ Performs non-negative matrix factorization. @@ -399,7 +399,7 @@ def nmf( Parameters ---------- - s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) + input_matrix : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_components : Int. Default is 2. Number of components to keep (dimensionality of output vectors). @@ -439,17 +439,17 @@ def nmf( """ nmf = NMF(n_components=n_components, init="random", random_state=random_state,) - if _check_is_valid_DocumentTermDF(s): - s_coo = s.sparse.to_coo() - s_for_vectorization = s_coo.astype("float64") + if _check_is_valid_DocumentTermDF(input_matrix): + input_matrix_coo = input_matrix.sparse.to_coo() + input_matrix_for_vectorization = input_matrix_coo.astype("float64") else: - s_for_vectorization = list(s) + input_matrix_for_vectorization = list(input_matrix) - return pd.Series(list(nmf.fit_transform(s_for_vectorization)), index=s.index) + return pd.Series(list(nmf.fit_transform(input_matrix_for_vectorization)), index=input_matrix.index) def tsne( - s: Union[pd.Series, pd.DataFrame], + input_matrix: Union[pd.Series, pd.DataFrame], n_components=2, perplexity=30.0, learning_rate=200.0, @@ -475,7 +475,7 @@ def tsne( Parameters ---------- - s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) + input_matrix : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_components : int, default is 2. Number of components to keep (dimensionality of output vectors). 
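A minimal sketch of the two `input_matrix` shapes this patch threads through every function — the sample sentences are invented for illustration, and it assumes the patched texthero branch from this PR rather than a released version:

import pandas as pd
import texthero as hero

s = pd.Series(["football sports", "music violin"]).pipe(hero.tokenize)

df = hero.count(s)  # DocumentTermDF: sparse DataFrame, one column per term
vs = hero.pca(df)   # VectorSeries: one dense vector per document cell

print(type(df), type(vs))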
@@ -541,13 +541,13 @@ def tsne( n_jobs=n_jobs, ) - if _check_is_valid_DocumentTermDF(s): - s_coo = s.sparse.to_coo() - s_for_vectorization = s_coo.astype("float64") + if _check_is_valid_DocumentTermDF(input_matrix): + input_matrix_coo = input_matrix.sparse.to_coo() + input_matrix_for_vectorization = input_matrix_coo.astype("float64") else: - s_for_vectorization = list(s) + s_for_vectorization = list(input_matrix) - return pd.Series(list(tsne.fit_transform(s_for_vectorization)), index=s.index) + return pd.Series(list(tsne.fit_transform(input_matrix_for_vectorization)), index=input_matrix.index) """ @@ -556,7 +556,7 @@ def tsne( def kmeans( - s: Union[pd.Series, pd.DataFrame], + input_matrix: Union[pd.Series, pd.DataFrame], n_clusters=5, n_init=10, max_iter=300, @@ -584,7 +584,7 @@ def kmeans( Parameters ---------- - s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) + input_matrix: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_clusters: Int, default to 5. The number of clusters to separate the data into. @@ -634,11 +634,11 @@ def kmeans( """ - if _check_is_valid_DocumentTermDF(s): - s_coo = s.sparse.to_coo() - s_for_vectorization = s_coo.astype("float64") + if _check_is_valid_DocumentTermDF(input_matrix): + input_matrix_coo = input_matrix.sparse.to_coo() + input_matrix_for_vectorization = input_matrix_coo.astype("float64") else: - s_for_vectorization = list(s) + s_for_vectorization = list(input_matrix) kmeans = KMeans( n_clusters=n_clusters, @@ -648,13 +648,13 @@ def kmeans( copy_x=True, algorithm=algorithm, ).fit(s_for_vectorization) - return pd.Series(kmeans.predict(s_for_vectorization), index=s.index).astype( + return pd.Series(kmeans.predict(s_for_vectorization), index=input_matrix.index).astype( "category" ) def dbscan( - s: Union[pd.Series, pd.DataFrame], + input_matrix: Union[pd.Series, pd.DataFrame], eps=0.5, min_samples=5, metric="euclidean", @@ -685,7 +685,7 @@ def dbscan( Parameters ---------- - s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) + input_matrix: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) eps : float, default=0.5 The maximum distance between two samples for one to be considered @@ -743,11 +743,11 @@ def dbscan( """ - if _check_is_valid_DocumentTermDF(s): - s_coo = s.sparse.to_coo() - s_for_vectorization = s_coo.astype("float64") + if _check_is_valid_DocumentTermDF(input_matrix): + input_matrix_coo = input_matrix.sparse.to_coo() + input_matrix_for_vectorization = input_matrix_coo.astype("float64") else: - s_for_vectorization = list(s) + input_matrix_for_vectorization = list(input_matrix) return pd.Series( DBSCAN( @@ -757,13 +757,13 @@ def dbscan( metric_params=metric_params, leaf_size=leaf_size, n_jobs=n_jobs, - ).fit_predict(s_for_vectorization), - index=s.index, + ).fit_predict(input_matrix_for_vectorization), + index=input_matrix.index, ).astype("category") def meanshift( - s: Union[pd.Series, pd.DataFrame], + input_matrix: Union[pd.Series, pd.DataFrame], bandwidth=None, bin_seeding=False, min_bin_freq=1, @@ -795,7 +795,7 @@ def meanshift( Parameters ---------- - s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) + input_matrix: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) bandwidth : float, default=None Bandwidth used in the RBF kernel. 
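The renamed variables above all feed one sparse-aware dispatch pattern that kmeans, dbscan and meanshift repeat. A standalone sketch of that pattern — the helper name `_to_sklearn_input` is hypothetical, only the pandas calls are real:

import pandas as pd

def _to_sklearn_input(input_matrix):
    # A DocumentTermDF has all-sparse columns, so it converts losslessly
    # to a scipy COO matrix that sklearn can consume without densifying.
    if isinstance(input_matrix, pd.DataFrame):
        return input_matrix.sparse.to_coo().astype("float64")
    # A VectorSeries is simply unpacked into a list of vectors.
    return list(input_matrix)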
@@ -854,10 +854,10 @@ def meanshift( """ - if _check_is_valid_DocumentTermDF(s): - vectors = s.values + if _check_is_valid_DocumentTermDF(input_matrix): + vectors = input_matrix.values else: - vectors = list(s) + vectors = list(input_matrix) return pd.Series( MeanShift( @@ -868,7 +868,7 @@ def meanshift( n_jobs=n_jobs, max_iter=max_iter, ).fit_predict(vectors), - index=s.index, + index=input_matrix.index, ).astype("category") @@ -883,7 +883,7 @@ def meanshift( """ -def normalize(s: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Series: +def normalize(input_matrix: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Series: """ Normalize every cell in a Pandas Series. @@ -892,7 +892,7 @@ def normalize(s: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Series: Parameters ---------- - s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) + input_matrix: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) norm: str, default to "l2" One of "l1", "l2", or "max". The norm that is used. @@ -919,19 +919,19 @@ def normalize(s: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Series: `Norm on Wikipedia `_ """ - isDocumentTermDF = _check_is_valid_DocumentTermDF(s) + isDocumentTermDF = _check_is_valid_DocumentTermDF(input_matrix) if isDocumentTermDF: - s_coo = s.sparse.to_coo() - s_for_vectorization = s_coo.astype("float64") + input_matrix_coo = input_matrix.sparse.to_coo() + input_matrix_for_vectorization = input_matrix_coo.astype("float64") else: - s_for_vectorization = list(s) + input_matrix_for_vectorization = list(input_matrix) result = sklearn_normalize( - s_for_vectorization, norm=norm + input_matrix_for_vectorization, norm=norm ) # Can handle sparse input. if isDocumentTermDF: - return pd.DataFrame.sparse.from_spmatrix(result, s.index, s.columns) + return pd.DataFrame.sparse.from_spmatrix(result, input_matrix.index, input_matrix.columns) else: - return pd.Series(list(result), index=s.index) + return pd.Series(list(result), index=input_matrix.index) From 4b5c67be337bfb7d90e2fcf3fb355332d762b486 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Mon, 7 Sep 2020 21:09:33 +0200 Subject: [PATCH 15/23] resolving smaller comment issues --- tests/test_representation.py | 40 +++++++++++++++--------------- texthero/representation.py | 48 ++++++++++++++++++++++-------------- 2 files changed, 49 insertions(+), 39 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index ff821efb..4eab08fa 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -50,9 +50,9 @@ def _tfidf(term, corpus, document_index): [["Test", "Test", "TEST", "!"], ["Test", "?", ".", "."]], index=[5, 7] ) -s_tokenized_output_index = pd.Index([0, 1]) +tokenized_output_index = pd.Index([0, 1]) -s_tokenized_output_index_noncontinous = pd.Index([5, 7]) +tokenized_output_noncontinous_index = pd.Index([5, 7]) test_cases_vectorization = [ # format: [function_name, function, correct output for tokenized input above] @@ -61,7 +61,7 @@ def _tfidf(term, corpus, document_index): representation.count, pd.DataFrame( [[1, 0, 0, 1, 2], [0, 2, 1, 0, 1]], - index=s_tokenized_output_index, + index=tokenized_output_index, columns=["!", ".", "?", "TEST", "Test"], ).astype("Sparse[int64, 0]"), ], @@ -70,7 +70,7 @@ def _tfidf(term, corpus, document_index): representation.term_frequency, pd.DataFrame( [[0.125, 0.0, 0.0, 0.125, 0.250], [0.0, 0.25, 0.125, 0.0, 0.125]], - index=s_tokenized_output_index, + index=tokenized_output_index, columns=["!", ".", 
"?", "TEST", "Test"], dtype="Sparse", ).astype("Sparse[float64, nan]"), @@ -86,7 +86,7 @@ def _tfidf(term, corpus, document_index): ], [_tfidf(x, s_tokenized, 1) for x in ["!", ".", "?", "TEST", "Test"]], ], - index=s_tokenized_output_index, + index=tokenized_output_index, columns=["!", ".", "?", "TEST", "Test"], ).astype("Sparse[float64, nan]"), ], @@ -98,7 +98,7 @@ def _tfidf(term, corpus, document_index): [ "count", representation.count, - pd.DataFrame([2, 1], index=s_tokenized_output_index, columns=["Test"],).astype( + pd.DataFrame([2, 1], index=tokenized_output_index, columns=["Test"],).astype( "Sparse[int64, 0]" ), ], @@ -106,21 +106,21 @@ def _tfidf(term, corpus, document_index): "term_frequency", representation.term_frequency, pd.DataFrame( - [0.666667, 0.333333], index=s_tokenized_output_index, columns=["Test"], + [0.666667, 0.333333], index=tokenized_output_index, columns=["Test"], ).astype("Sparse[float64, nan]"), ], [ "tfidf", representation.tfidf, - pd.DataFrame([2, 1], index=s_tokenized_output_index, columns=["Test"],).astype( + pd.DataFrame([2, 1], index=tokenized_output_index, columns=["Test"],).astype( "Sparse[float64, nan]" ), ], ] -s_vector_series = pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7]) -s_documenttermDF = pd.DataFrame( +vector_s = pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7]) +document_term_df = pd.DataFrame( [[1.0, 0.0], [0.0, 0.0]], index=[5, 7], columns=["a", "b"], ).astype("Sparse[float64, nan]") @@ -184,7 +184,7 @@ def test_vectorization_noncontinuous_index_kept( ): result_s = test_function(s_tokenized_with_noncontinuous_index) pd.testing.assert_index_equal( - s_tokenized_output_index_noncontinous, result_s.index + tokenized_output_noncontinous_index, result_s.index ) @parameterized.expand(test_cases_vectorization_min_df) @@ -216,11 +216,11 @@ def test_dim_reduction_and_clustering_with_vector_series_input( s_true = correct_output if name == "kmeans": - result_s = test_function(s_vector_series, random_state=42, n_clusters=2) + result_s = test_function(vector_s, random_state=42, n_clusters=2) elif name == "dbscan" or name == "meanshift" or name == "normalize": - result_s = test_function(s_vector_series) + result_s = test_function(vector_s) else: - result_s = test_function(s_vector_series, random_state=42) + result_s = test_function(vector_s, random_state=42) pd.testing.assert_series_equal( s_true, @@ -242,11 +242,11 @@ def test_dim_reduction_and_clustering_with_documenttermDF_input( return if name == "kmeans": - result_s = test_function(s_documenttermDF, random_state=42, n_clusters=2) - elif name == "dbscan" or name == "meanshift" or name == "normalize": - result_s = test_function(s_documenttermDF) + result_s = test_function(document_term_df, random_state=42, n_clusters=2) + elif name == "dbscan" or name == "meanshift": + result_s = test_function(document_term_df) else: - result_s = test_function(s_documenttermDF, random_state=42) + result_s = test_function(document_term_df, random_state=42) pd.testing.assert_series_equal( s_true, @@ -257,10 +257,10 @@ def test_dim_reduction_and_clustering_with_documenttermDF_input( check_category_order=False, ) - def test_normalize_documenttermDF_also_as_output(self): + def test_normalize_document_term_df_also_as_output(self): # normalize should also return DocumentTermDF output for DocumentTermDF # input so we test it separately - result = representation.normalize(s_documenttermDF) + result = representation.normalize(document_term_df) correct_output = pd.DataFrame( [[1.0, 0.0], [0.0, 0.0]], index=[5, 7], 
columns=["a", "b"], ) diff --git a/texthero/representation.py b/texthero/representation.py index eb0ee506..2223a981 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -134,7 +134,7 @@ def term_frequency( s: pd.Series, max_features: Optional[int] = None, min_df=1, max_df=1.0, ) -> pd.DataFrame: """ - Return a count document-term DataFrame based on the given Pandas Series + Return a Term Frequenzy document-term DataFrame based on the given Pandas Series Rows of the returned DataFrame represent document whereas columns are terms. The value in the cell document-term is the frequency of the term in @@ -330,7 +330,7 @@ def pca( Parameters ---------- - input_matrix : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) + input_matrix : Pandas Series (VectorSeries) or DataFrame (DocumentTermDF) n_components : Int. Default is 2. Number of components to keep (dimensionality of output vectors). @@ -399,7 +399,7 @@ def nmf( Parameters ---------- - input_matrix : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) + input_matrix : Pandas Series (VectorSeries) or DataFrame (DocumentTermDF) n_components : Int. Default is 2. Number of components to keep (dimensionality of output vectors). @@ -445,7 +445,10 @@ def nmf( else: input_matrix_for_vectorization = list(input_matrix) - return pd.Series(list(nmf.fit_transform(input_matrix_for_vectorization)), index=input_matrix.index) + return pd.Series( + list(nmf.fit_transform(input_matrix_for_vectorization)), + index=input_matrix.index, + ) def tsne( @@ -475,7 +478,7 @@ def tsne( Parameters ---------- - input_matrix : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) + input_matrix : Pandas Series (VectorSeries) or DataFrame (DocumentTermDF) n_components : int, default is 2. Number of components to keep (dimensionality of output vectors). @@ -545,9 +548,12 @@ def tsne( input_matrix_coo = input_matrix.sparse.to_coo() input_matrix_for_vectorization = input_matrix_coo.astype("float64") else: - s_for_vectorization = list(input_matrix) + input_matrix_for_vectorization = list(input_matrix) - return pd.Series(list(tsne.fit_transform(input_matrix_for_vectorization)), index=input_matrix.index) + return pd.Series( + list(tsne.fit_transform(input_matrix_for_vectorization)), + index=input_matrix.index, + ) """ @@ -584,7 +590,7 @@ def kmeans( Parameters ---------- - input_matrix: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) + input_matrix: Pandas Series (VectorSeries) or DataFrame (DocumentTermDF) n_clusters: Int, default to 5. The number of clusters to separate the data into. @@ -616,7 +622,8 @@ def kmeans( -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, fun, guitar"]) + >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", + ... 
"football, fun, sports", "music, fun, guitar"]) >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency) >>> hero.kmeans(s, n_clusters=2, random_state=42) 0 1 @@ -638,7 +645,7 @@ def kmeans( input_matrix_coo = input_matrix.sparse.to_coo() input_matrix_for_vectorization = input_matrix_coo.astype("float64") else: - s_for_vectorization = list(input_matrix) + input_matrix_for_vectorization = list(input_matrix) kmeans = KMeans( n_clusters=n_clusters, @@ -647,10 +654,10 @@ def kmeans( random_state=random_state, copy_x=True, algorithm=algorithm, - ).fit(s_for_vectorization) - return pd.Series(kmeans.predict(s_for_vectorization), index=input_matrix.index).astype( - "category" - ) + ).fit(input_matrix_for_vectorization) + return pd.Series( + kmeans.predict(input_matrix_for_vectorization), index=input_matrix.index + ).astype("category") def dbscan( @@ -685,7 +692,7 @@ def dbscan( Parameters ---------- - input_matrix: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) + input_matrix: Pandas Series (VectorSeries) or DataFrame (DocumentTermDF) eps : float, default=0.5 The maximum distance between two samples for one to be considered @@ -724,7 +731,8 @@ def dbscan( -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, enjoy, guitar"]) + >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", + ... "football, fun, sports", "music, enjoy, guitar"]) >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) >>> hero.dbscan(s, min_samples=1, eps=4) 0 0 @@ -795,7 +803,7 @@ def meanshift( Parameters ---------- - input_matrix: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) + input_matrix: Pandas Series (VectorSeries) or DataFrame (DocumentTermDF) bandwidth : float, default=None Bandwidth used in the RBF kernel. @@ -892,7 +900,7 @@ def normalize(input_matrix: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Ser Parameters ---------- - input_matrix: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) + input_matrix: Pandas Series (VectorSeries) or DataFrame (DocumentTermDF) norm: str, default to "l2" One of "l1", "l2", or "max". The norm that is used. @@ -932,6 +940,8 @@ def normalize(input_matrix: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Ser ) # Can handle sparse input. if isDocumentTermDF: - return pd.DataFrame.sparse.from_spmatrix(result, input_matrix.index, input_matrix.columns) + return pd.DataFrame.sparse.from_spmatrix( + result, input_matrix.index, input_matrix.columns + ) else: return pd.Series(list(result), index=input_matrix.index) From 00ff777aa3197e482fddf980b0f2a354bf1c584d Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Mon, 7 Sep 2020 21:19:22 +0200 Subject: [PATCH 16/23] updated name of the function --- texthero/representation.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/texthero/representation.py b/texthero/representation.py index 2223a981..032daa99 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -27,7 +27,7 @@ """ -def _check_is_valid_DocumentTermDF(df: Union[pd.DataFrame, pd.Series]) -> bool: +def _check_is_valid_DataFrame(df: Union[pd.DataFrame, pd.Series]) -> bool: """ Check if the given Pandas Series is a Document Term DF. 
@@ -304,7 +304,7 @@ def pca( input_matrix: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None ) -> pd.Series: """ - Perform principal component analysis on the given Pandas Series. + Perform principal component analysis on the given input. Principal Component Analysis (PCA) is a statistical method that is used to reveal where the variance in a dataset comes from. For textual data, @@ -367,7 +367,7 @@ def pca( """ pca = PCA(n_components=n_components, random_state=random_state, copy=False) - if _check_is_valid_DocumentTermDF(input_matrix): + if _check_is_valid_DataFrame(input_matrix): values = input_matrix.values else: values = list(input_matrix) @@ -439,7 +439,7 @@ def nmf( """ nmf = NMF(n_components=n_components, init="random", random_state=random_state,) - if _check_is_valid_DocumentTermDF(input_matrix): + if _check_is_valid_DataFrame(input_matrix): input_matrix_coo = input_matrix.sparse.to_coo() input_matrix_for_vectorization = input_matrix_coo.astype("float64") else: @@ -544,7 +544,7 @@ def tsne( n_jobs=n_jobs, ) - if _check_is_valid_DocumentTermDF(input_matrix): + if _check_is_valid_DataFrame(input_matrix): input_matrix_coo = input_matrix.sparse.to_coo() input_matrix_for_vectorization = input_matrix_coo.astype("float64") else: @@ -641,7 +641,7 @@ def kmeans( """ - if _check_is_valid_DocumentTermDF(input_matrix): + if _check_is_valid_DataFrame(input_matrix): input_matrix_coo = input_matrix.sparse.to_coo() input_matrix_for_vectorization = input_matrix_coo.astype("float64") else: @@ -751,7 +751,7 @@ def dbscan( """ - if _check_is_valid_DocumentTermDF(input_matrix): + if _check_is_valid_DataFrame(input_matrix): input_matrix_coo = input_matrix.sparse.to_coo() input_matrix_for_vectorization = input_matrix_coo.astype("float64") else: @@ -862,7 +862,7 @@ def meanshift( """ - if _check_is_valid_DocumentTermDF(input_matrix): + if _check_is_valid_DataFrame(input_matrix): vectors = input_matrix.values else: vectors = list(input_matrix) @@ -927,7 +927,7 @@ def normalize(input_matrix: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Ser `Norm on Wikipedia `_ """ - isDocumentTermDF = _check_is_valid_DocumentTermDF(input_matrix) + isDocumentTermDF = _check_is_valid_DataFrame(input_matrix) if isDocumentTermDF: input_matrix_coo = input_matrix.sparse.to_coo() From e8d18166035db18b588b4f203f332c860aceab0f Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Mon, 7 Sep 2020 21:32:56 +0200 Subject: [PATCH 17/23] added some docstring improvements --- texthero/representation.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/texthero/representation.py b/texthero/representation.py index 032daa99..0046065d 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -60,9 +60,9 @@ def count( """ Represent a text-based Pandas Series using count. - Return a Document Term DataFrame with the - number of occurences of a document's words for every - document. + Rows of the returned DataFrame represent document whereas columns are terms. + The value in the cell document-term is the number of the term in + this document. The output is sparse. TODO add tutorial link The input Series should already be tokenized. If not, it will @@ -134,7 +134,7 @@ def term_frequency( s: pd.Series, max_features: Optional[int] = None, min_df=1, max_df=1.0, ) -> pd.DataFrame: """ - Return a Term Frequenzy document-term DataFrame based on the given Pandas Series + Represent a text-based Pandas Series using Term Frequency. 
 Rows of the returned DataFrame represent document whereas columns are terms.
 The value in the cell document-term is the frequency of the term in
 this document. The output is sparse.
 
@@ -207,6 +207,10 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram
     """
     Represent a text-based Pandas Series using TF-IDF.
 
+    Rows of the returned DataFrame represent document whereas columns are terms.
+    The value in the cell document-term is the tfidf-value of the term in
+    this document. The output is sparse.
+
     *Term Frequency - Inverse Document Frequency (TF-IDF)* is a formula to
     calculate the _relative importance_ of the words in a document, taking
     into account the words' occurences in other documents. It consists of two
     parts:
 
From 3ba2ebc8d5259d7cedc1aa59707f909d66a3bd5f Mon Sep 17 00:00:00 2001
From: Maximilian Krahn
Date: Wed, 9 Sep 2020 22:43:09 +0200
Subject: [PATCH 18/23] edited docstrings and DocumentTermDF

---
 texthero/representation.py | 336 ++++++++++++++++++++-----------------
 1 file changed, 183 insertions(+), 153 deletions(-)

diff --git a/texthero/representation.py b/texthero/representation.py
index 0046065d..f222e40c 100644
--- a/texthero/representation.py
+++ b/texthero/representation.py
@@ -1,5 +1,6 @@
 """
-Map words into vectors using different algorithms such as TF-IDF, word2vec or GloVe.
+Map words into vectors using different algorithms such as
+TF-IDF, word2vec or GloVe.
 """
 
 import pandas as pd
@@ -29,9 +30,9 @@ def _check_is_valid_DataFrame(df: Union[pd.DataFrame, pd.Series]) -> bool:
     """
-    Check if the given Pandas Series is a Document Term DF.
+    Check if the given input is a DataFrame without MultiIndex columns.
 
-    Returns true if input is Document Term DF, else False.
+    Returns true if the input is a DataFrame without MultiIndex columns, else False.
     """
     return isinstance(df, pd.DataFrame) and not isinstance(df.columns, pd.MultiIndex)
 
 
 # Warning message for not-tokenized inputs
 _not_tokenized_warning_message = (
-    "It seems like the given Pandas Series s is not tokenized. This function will"
-    " tokenize it automatically using hero.tokenize(s) first. You should consider"
-    " tokenizing it yourself first with hero.tokenize(s) in the future."
+    "It seems like the given Pandas Series s is not tokenized. This"
+    " function will tokenize it automatically using hero.tokenize(s)"
+    " first. You should consider tokenizing it yourself first with"
+    " hero.tokenize(s) in the future."
 )
 
@@ -60,9 +62,9 @@ def count(
     """
     Represent a text-based Pandas Series using count.
 
-    Rows of the returned DataFrame represent document whereas columns are terms.
-    The value in the cell document-term is the number of the term in
-    this document. The output is sparse.
+    Rows of the returned DataFrame represent documents, whereas
+    columns are terms. The value in the cell document-term is the
+    count of the term in this document. The output is sparse.
     TODO add tutorial link
 
     The input Series should already be tokenized. If not, it will
@@ -73,21 +75,21 @@ def count(
     s : Pandas Series (tokenized)
 
     max_features : int, optional, default to None.
-        Maximum number of features to keep. Will keep all features if set to
-        None.
+        Maximum number of features to keep. Will keep all features if
+        set to None.
 
     min_df : float in range [0.0, 1.0] or int, default=1
         When building the vocabulary ignore terms that have a document
         frequency (number of documents they appear in) strictly lower than
         the given threshold.
-        If float, the parameter represents a proportion of documents, integer
-        absolute counts.
+        If float, the parameter represents a proportion of documents,
+        integer absolute counts.
 
     max_df : float in range [0.0, 1.0] or int, default=1.0
-        Ignore terms that have a document frequency (number of documents they
-        appear in) frequency strictly higher than the given threshold.
-        If float, the parameter represents a proportion of documents, integer
-        absolute counts.
+        Ignore terms that have a document frequency (number of documents
+        they appear in) strictly higher than the given threshold.
+        If float, the parameter represents a proportion of
+        documents, integer absolute counts.
 
     binary : bool, default=False
         If True, all non zero counts are set to 1.
 
@@ -96,7 +98,9 @@ def count(
     Examples
     --------
     >>> import texthero as hero
     >>> import pandas as pd
-    >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize)
+    >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(
+    ...     hero.tokenize
+    ... )
     >>> hero.count(s) # doctest: +SKIP
       Sentence  one  two
     0         1    1    0
     1         1    0    1
 
     See Also
     --------
-    Document Term DataFrame: TODO add tutorial link
+    DataFrame: TODO add tutorial link
     """
     # TODO. Can be rewritten without sklearn.
 
@@ -136,9 +140,9 @@ def term_frequency(
     """
     Represent a text-based Pandas Series using Term Frequency.
 
-    Rows of the returned DataFrame represent document whereas columns are terms.
-    The value in the cell document-term is the frequency of the term in
-    this document. The output is sparse.
+    Rows of the returned DataFrame represent documents, whereas columns
+    are terms. The value in the cell document-term is the frequency of
+    the term in this document. The output is sparse.
     TODO add tutorial link
 
     The input Series should already be tokenized. If not, it will
@@ -149,27 +153,29 @@ def term_frequency(
     s : Pandas Series (tokenized)
 
     max_features : int, optional, default to None.
-        Maximum number of features to keep. Will keep all features if set to
-        None.
+        Maximum number of features to keep. Will keep all features if
+        set to None.
 
     min_df : float in range [0.0, 1.0] or int, default=1
         When building the vocabulary ignore terms that have a document
         frequency (number of documents they appear in) strictly lower than
         the given threshold.
-        If float, the parameter represents a proportion of documents, integer
-        absolute counts.
+        If float, the parameter represents a proportion of documents,
+        integer absolute counts.
 
     max_df : float in range [0.0, 1.0] or int, default=1.0
-        Ignore terms that have a document frequency (number of documents they
-        appear in) frequency strictly higher than the given threshold.
-        If float, the parameter represents a proportion of documents, integer
-        absolute counts.
+        Ignore terms that have a document frequency (number of documents
+        they appear in) strictly higher than the given threshold.
+        If float, the parameter represents a proportion of
+        documents, integer absolute counts.
 
     Examples
     --------
     >>> import texthero as hero
     >>> import pandas as pd
-    >>> s = pd.Series(["Sentence one hey", "Sentence two"]).pipe(hero.tokenize)
+    >>> s = pd.Series(["Sentence one hey", "Sentence two"]).pipe(
+    ...     hero.tokenize
+    ... )
    >>> hero.term_frequency(s) # doctest: +SKIP
       Sentence  hey  one  two
     0      0.2  0.2  0.2  0.0
     1      0.2  0.0  0.0  0.2
 
     See Also
     --------
     DataFrame: TODO add tutorial link
     """
     # Check if input is tokenized. Else, print warning and tokenize.
     if not isinstance(s.iloc[0], list):
 
@@ -207,13 +213,18 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram
     """
     Represent a text-based Pandas Series using TF-IDF.
 
-    Rows of the returned DataFrame represent document whereas columns are terms.
-    The value in the cell document-term is the tfidf-value of the term in
-    this document. The output is sparse.
+    Rows of the returned DataFrame represent documents, whereas columns
+    are terms. The value in the cell document-term is the tfidf-value of
+    the term in this document. The output is sparse.
 
     *Term Frequency - Inverse Document Frequency (TF-IDF)* is a formula to
     calculate the _relative importance_ of the words in a document, taking
-    into account the words' occurences in other documents. It consists of two
-    parts:
+    into account the words' occurrences in other documents. It consists of
+    two parts:
 
-    The *term frequency (tf)* tells us how frequently a term is present in a
-    document, so tf(document d, term t) = number of times t appears in d.
+    The *term frequency (tf)* tells us how frequently a term is present
+    in a document, so tf(document d, term t) = number of times t appears
+    in d.
 
     The *inverse document frequency (idf)* measures how _important_ or
     _characteristic_ a term is among the whole corpus (i.e. among all
@@ -226,11 +233,11 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram
     Finally, tf-idf(document d, term t) = tf(d, t) * idf(t).
 
-    Different from the `sklearn-implementation of
-    tfidf `, this function does *not* normalize
-    the output in any way, so the result is exactly what you get applying the
-    formula described above.
+    Different from the `sklearn-implementation of tfidf
+    `, this function does *not*
+    normalize the output in any way, so the result is exactly what you
+    get applying the formula described above.
 
     Return a Document Term DataFrame with the tfidf of every word in the
     document. The output is sparse.
@@ -250,15 +257,15 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram
         When building the vocabulary ignore terms that have a document
         frequency (number of documents they appear in) strictly lower than
         the given threshold.
-        If float, the parameter represents a proportion of documents, integer
-        absolute counts.
+        If float, the parameter represents a proportion of documents,
+        integer absolute counts.
 
     max_df : float in range [0.0, 1.0] or int, default=1.0
         Ignore terms that have a document frequency (number of documents they
         appear in) frequency strictly higher than the given threshold.
-        This arguments basically permits to remove corpus-specific stop words.
-        If float, the parameter represents a proportion of documents, integer
-        absolute counts.
+        This argument basically permits removing corpus-specific stop
+        words. If float, the parameter represents a proportion of documents,
+        integer absolute counts.
 
     Examples
     --------
 
@@ -310,44 +317,45 @@ def pca(
     """
     Perform principal component analysis on the given input.
 
-    Principal Component Analysis (PCA) is a statistical method that is used
-    to reveal where the variance in a dataset comes from. 
For textual data, - one could for example first represent a Series of documents using - :meth:`texthero.representation.tfidf` to get a vector representation of - each document. Then, PCA can generate new vectors from the tfidf - representation that showcase the differences among the documents most - strongly in fewer dimensions. + Principal Component Analysis (PCA) is a statistical method that is + used to reveal where the variance in a dataset comes from. For + textual data, one could for example first represent a Series of + documents using :meth:`texthero.representation.tfidf` to get a vector + representation of each document. Then, PCA can generate new vectors + from the tfidf representation that showcase the differences among + the documents most strongly in fewer dimensions. For example, the tfidf vectors will have length 100 if hero.tfidf was - called on a large corpus with max_features=100. Visualizing 100 dimensions - is hard! Using PCA with n_components=3, every document will now get a - vector of length 3, and the vectors will be chosen so that the document - differences are easily visible. The corpus can now be visualized in 3D and - we can get a good first view of the data! + called on a large corpus with max_features=100. Visualizing 100 + dimensions is hard! Using PCA with n_components=3, every document will + now get a vector of length 3, and the vectors will be chosen so that + the document differences are easily visible. The corpus can now be + visualized in 3D and we can get a good first view of the data! In general, *pca* should be called after the text has already been represented to a matrix form. PCA cannot directly handle sparse input, so when calling pca on a - DocumentTermDF, the input has to be expanded which can lead to + sparse DataFrame, the input has to be expanded which can lead to memory problems with big datasets. Parameters ---------- - input_matrix : Pandas Series (VectorSeries) or DataFrame (DocumentTermDF) + input_matrix : Pandas Series (VectorSeries) or DataFrame n_components : Int. Default is 2. Number of components to keep (dimensionality of output vectors). If n_components is not set or None, all components are kept. random_state : int, default=None - Pass an int for reproducible results across multiple function calls. + Pass an int for reproducible results across multiple function + calls. Returns ------- - Pandas Series with the vector calculated by PCA for the document in every - cell. + Pandas Series with the vector calculated by PCA for the document in + every cell. Examples -------- @@ -366,7 +374,8 @@ def pca( See also -------- - `PCA on Wikipedia `_ + `PCA on Wikipedia + `_ """ pca = PCA(n_components=n_components, random_state=random_state, copy=False) @@ -393,17 +402,18 @@ def nmf( Given a document-term matrix (so in texthero usually a Series after applying - :meth:`texthero.representation.tfidf` or some other first representation - function that assigns a scalar (a weight) to each word), NMF will find - n_components many topics (clusters) and calculate a vector for each - document that places it correctly among the topics. + :meth:`texthero.representation.tfidf` or some other first + representation function that assigns a scalar (a weight) to each + word), NMF will find n_components many topics (clusters) and + calculate a vector for each document that places it correctly among + the topics. NMF can directly handle sparse input, so when calling nmf on a - DocumentTermDF, the advantage of sparseness is kept. 
+ sparse DataFrame, the advantage of sparseness is kept. Parameters ---------- - input_matrix : Pandas Series (VectorSeries) or DataFrame (DocumentTermDF) + input_matrix : Pandas Series (VectorSeries) or DataFrame n_components : Int. Default is 2. Number of components to keep (dimensionality of output vectors). @@ -414,16 +424,18 @@ def nmf( Returns ------- - Pandas Series with the vector calculated by NMF for the document in every - cell. + Pandas Series with the vector calculated by NMF for the document in + every cell. Examples -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Football, Sports, Soccer", "Music, Violin, Orchestra", - ... "Football, Music"]) - >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency) + >>> s = pd.Series(["Football, Sports, Soccer", + ... "Music, Violin, Orchestra", "Football, Music"]) + >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe( + ... hero.term_frequency + ... ) >>> hero.nmf(s) # doctest: +SKIP 0 [0.9080190347553924, 0.0] 1 [0.0, 0.771931061231598] @@ -468,21 +480,23 @@ def tsne( Performs TSNE on the given pandas series. t-distributed Stochastic Neighbor Embedding (t-SNE) is - a machine learning algorithm used to visualize high-dimensional data in - fewer dimensions. In natural language processing, the high-dimensional data - is usually a document-term matrix (so in texthero usually a Series after - applying :meth:`texthero.representation.tfidf` or some other first + a machine learning algorithm used to visualize high-dimensional data + in fewer dimensions. In natural language processing, the + high-dimensional data is usually a document-term matrix (so in + texthero usually a Series after applying + :meth:`texthero.representation.tfidf` or some other first representation function that assigns a scalar (a weight) to each word) - that is hard to visualize as there might be many terms. With t-SNE, every - document gets a new, low-dimensional (n_components entries) vector in such - a way that the differences / similarities between documents are preserved. + that is hard to visualize as there might be many terms. With t-SNE, + every document gets a new, low-dimensional (n_components entries) + vector in such a way that the differences / similarities between + documents are preserved. T-SNE can directly handle sparse input, so when calling tsne on a - DocumentTermDF, the advantage of sparseness is kept. + sparse DataFrame, the advantage of sparseness is kept. Parameters ---------- - input_matrix : Pandas Series (VectorSeries) or DataFrame (DocumentTermDF) + input_matrix : Pandas Series (VectorSeries) or DataFrame n_components : int, default is 2. Number of components to keep (dimensionality of output vectors). @@ -496,20 +510,21 @@ def tsne( different results. learning_rate : float, optional (default: 200.0) - The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If - the learning rate is too high, the data may look like a 'ball' with any - point approximately equidistant from its nearest neighbours. If the - learning rate is too low, most points may look compressed in a dense - cloud with few outliers. If the cost function gets stuck in a bad local - minimum increasing the learning rate may help. + The learning rate for t-SNE is usually in the range + [10.0, 1000.0]. If the learning rate is too high, the data may + look like a 'ball' with any point approximately equidistant from + its nearest neighbours. 
If the learning rate is too low, most + points may look compressed in a dense cloud with few outliers. If + the cost function gets stuck in a bad local minimum increasing the + learning rate may help. n_iter : int, optional (default: 1000) Maximum number of iterations for the optimization. Should be at least 250. random_state : int, default=None - Determines the random number generator. Pass an int for reproducible - results across multiple function calls. + Determines the random number generator. Pass an int for + reproducible results across multiple function calls. n_jobs : int, optional, default=-1 The number of parallel jobs to run for neighbors search. @@ -517,15 +532,15 @@ def tsne( Returns ------- - Pandas Series with the vector calculated by t-SNE for the document in every - cell. + Pandas Series with the vector calculated by t-SNE for the document in + every cell. Examples -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Football, Sports, Soccer", "Music, Violin, Orchestra", - ... "Football, Music"]) + >>> s = pd.Series(["Football, Sports, Soccer", + ... "Music, Violin, Orchestra", "Football, Music"]) >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency) >>> hero.tsne(s, random_state=42) # doctest: +SKIP 0 [-18.833383560180664, -276.800537109375] @@ -585,16 +600,17 @@ def kmeans( Given a document-term matrix (so in texthero usually a Series after applying - :meth:`texthero.representation.tfidf` or some other first representation - function that assigns a scalar (a weight) to each word), K-means will find - k topics (clusters) and assign a topic to each document. + :meth:`texthero.representation.tfidf` or some other first + representation function that assigns a scalar (a weight) to each + word), K-means will find k topics (clusters) and assign a topic to + each document. Kmeans can directly handle sparse input, so when calling kmeans on a - DocumentTermDF, the advantage of sparseness is kept. + sparse DataFrame, the advantage of sparseness is kept. Parameters ---------- - input_matrix: Pandas Series (VectorSeries) or DataFrame (DocumentTermDF) + input_matrix: Pandas Series (VectorSeries) or DataFrame n_clusters: Int, default to 5. The number of clusters to separate the data into. @@ -609,26 +625,30 @@ def kmeans( single run. random_state : int, default=None - Determines random number generation for centroid initialization. Use - an int to make the randomness deterministic. + Determines random number generation for centroid initialization. + Use an int to make the randomness deterministic. algorithm : {"auto", "full", "elkan"}, default="auto" - K-means algorithm to use. The classical EM-style algorithm is "full". - The "elkan" variation is more efficient on data with well-defined - clusters, by using the triangle inequality. However it's more memory - intensive. + K-means algorithm to use. The classical EM-style algorithm is + "full". The "elkan" variation is more efficient on data with + well-defined clusters, by using the triangle inequality. However + it's more memory intensive. Returns ------- - Pandas Series with the cluster the document was assigned to in each cell. + Pandas Series with the cluster the document was assigned to in each + cell. Examples -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", + >>> s = pd.Series(["Football, Sports, Soccer", + ... "music, violin, orchestra", ... 
"football, fun, sports", "music, fun, guitar"]) - >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency) + >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe( + ... hero.term_frequency + ... ) >>> hero.kmeans(s, n_clusters=2, random_state=42) 0 1 1 0 @@ -641,7 +661,8 @@ def kmeans( See also -------- - `kmeans on Wikipedia `_ + `kmeans on Wikipedia + `_ """ @@ -687,31 +708,34 @@ def dbscan( Given a document-term matrix (so in texthero usually a Series after applying - :meth:`texthero.representation.tfidf` or some other first representation - function that assigns a scalar (a weight) to each word), DBSCAN will find - topics (clusters) and assign a topic to each document. + :meth:`texthero.representation.tfidf` or some other first + representation function that assigns a scalar (a weight) to each + word), DBSCAN will find topics (clusters) and assign a topic to + each document. DBSCAN can directly handle sparse input, so when calling dbscan on a - DocumentTermDF, the advantage of sparseness is kept. + sparse DataFrame, the advantage of sparseness is kept. Parameters ---------- - input_matrix: Pandas Series (VectorSeries) or DataFrame (DocumentTermDF) + input_matrix: Pandas Series (VectorSeries) or DataFrame eps : float, default=0.5 The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most - important DBSCAN parameter to choose appropriately for your data set - and distance function. + important DBSCAN parameter to choose appropriately for your data + set and distance function. min_samples : int, default=5 - The number of samples (or total weight) in a neighborhood for a point - to be considered as a core point. This includes the point itself. + The number of samples (or total weight) in a neighborhood for a + point to be considered as a core point. This includes the point + itself. metric : string, or callable, default='euclidean' The metric to use when calculating distance between instances in a - feature array. Use `sorted(sklearn.neighbors.VALID_METRICS['brute'])` + feature array. Use + `sorted(sklearn.neighbors.VALID_METRICS['brute'])` to see valid options. metric_params : dict, default=None @@ -729,13 +753,15 @@ def dbscan( Returns ------- - Pandas Series with the cluster the document was assigned to in each cell. + Pandas Series with the cluster the document was assigned to in each + cell. Examples -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", + >>> s = pd.Series(["Football, Sports, Soccer", + ... "music, violin, orchestra", ... "football, fun, sports", "music, enjoy, guitar"]) >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) >>> hero.dbscan(s, min_samples=1, eps=4) @@ -797,41 +823,42 @@ def meanshift( Given a document-term matrix (so in texthero usually a Series after applying - :meth:`texthero.representation.tfidf` or some other first representation - function that assigns a scalar (a weight) to each word), mean shift will - find topics (clusters) and assign a topic to each document. + :meth:`texthero.representation.tfidf` or some other first + representation function that assigns a scalar (a weight) to each + word), mean shift will find topics (clusters) and assign a topic + to each document. 
-    Menashift cannot directly handle sparse input, so when calling meanshift on a
-    DocumentTermDF, the input has to be expanded which can lead to
-    memory problems with big datasets.
+    Meanshift cannot directly handle sparse input, so when calling
+    meanshift on a sparse DataFrame, the input has to be expanded
+    which can lead to memory problems with big datasets.
 
     Parameters
     ----------
-    input_matrix: Pandas Series (VectorSeries) or DataFrame (DocumentTermDF)
+    input_matrix: Pandas Series (VectorSeries) or DataFrame
 
     bandwidth : float, default=None
         Bandwidth used in the RBF kernel.
 
         If not given, the bandwidth is estimated.
         Estimating takes time at least quadratic in the number of samples
-        (i.e. documents). For large datasets, it’s wise to set the bandwidth
-        to a small value.
+        (i.e. documents). For large datasets, it’s wise to set the
+        bandwidth to a small value.
 
     bin_seeding : bool, default=False
         If true, initial kernel locations are not locations of all
         points, but rather the location of the discretized version of
         points, where points are binned onto a grid whose coarseness
-        corresponds to the bandwidth. Setting this option to True will speed
-        up the algorithm because fewer seeds will be initialized.
+        corresponds to the bandwidth. Setting this option to True will
+        speed up the algorithm because fewer seeds will be initialized.
 
     min_bin_freq : int, default=1
         To speed up the algorithm, accept only those bins with at least
         min_bin_freq points as seeds.
 
     cluster_all : bool, default=True
-        If true, then all points are clustered, even those orphans that are
-        not within any kernel. Orphans are assigned to the nearest kernel.
-        If false, then orphans are given cluster label -1.
+        If true, then all points are clustered, even those orphans that
+        are not within any kernel. Orphans are assigned to the nearest
+        kernel. If false, then orphans are given cluster label -1.
 
     n_jobs : int, default=-1
         The number of jobs to use for the computation.
 
     max_iter : int, default=300
         Maximum number of iterations, per seed point before the clustering
-        operation terminates (for that seed point), if has not converged yet.
+        operation terminates (for that seed point), if it has not
+        converged yet.
 
     Returns
     -------
-    Pandas Series with the cluster the document was assigned to in each cell.
+    Pandas Series with the cluster the document was assigned to in each
+    cell.
 
     Examples
     --------
 
@@ -899,12 +928,12 @@ def normalize(input_matrix: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Ser
     """
     Normalize every cell in a Pandas Series.
 
-    Input can be VectorSeries or DocumentTermDF. For DocumentTermDFs,
+    Input can be a VectorSeries or a DataFrame. For sparse DataFrames,
     the sparseness is kept.
 
     Parameters
     ----------
-    input_matrix: Pandas Series (VectorSeries) or DataFrame (DocumentTermDF)
+    input_matrix: Pandas Series (VectorSeries) or DataFrame
 
     norm: str, default to "l2"
         One of "l1", "l2", or "max". The norm that is used.
 
     Examples
     --------
     >>> import texthero as hero
     >>> import pandas as pd
     >>> col = ["a","b","c", "d"]
     >>> s = pd.DataFrame([[1, 2, 3, 4],[4, 2, 7, 5],[2, 2, 3, 5],[1, 2, 9, 8]],
     ... 
@@ -899,12 +928,12 @@ def normalize(input_matrix: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Ser
     """
     Normalize every cell in a Pandas Series.
 
-    Input can be VectorSeries or DocumentTermDF. For DocumentTermDFs,
+    Input can be VectorSeries or DataFrames. For sparse DataFrames,
     the sparseness is kept.
 
     Parameters
     ----------
-    input_matrix: Pandas Series (VectorSeries) or DataFrame (DocumentTermDF)
+    input_matrix: Pandas Series (VectorSeries) or DataFrame
 
     norm: str, default to "l2"
         One of "l1", "l2", or "max". The norm that is used.
 
     Examples
     --------
     >>> import texthero as hero
     >>> import pandas as pd
-    >>> col = pd.MultiIndex.from_tuples([(0, "a"), (0, "b"), (1, "c"), (1, "d")])
-    >>> s = pd.DataFrame([[1, 2, 3, 4],[4, 2, 7, 5],[2, 2, 3, 5],[1, 2, 9, 8]], columns=col).astype("Sparse")
-    >>> hero.normalize(s, norm="max") # doctest: +SKIP
-              0                   1
+    >>> col = ["a", "b", "c", "d"]
+    >>> s = pd.DataFrame([[1, 2, 3, 4],[4, 2, 7, 5],[2, 2, 3, 5],[1, 2, 9, 8]],
+    ...                  columns=col).astype("Sparse")
+    >>> hero.normalize(s, norm="max") # doctest: +SKIP
              a         b     c         d
     0  0.250000  0.500000  0.75  1.000000
     1  0.571429  0.285714  1.00  0.714286
@@ -928,12 +957,13 @@ def normalize(input_matrix: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Ser
     --------
     Representation Series link TODO add link to tutorial
 
-    `Norm on Wikipedia <https://en.wikipedia.org/wiki/Norm_(mathematics)>`_
+    `Norm on Wikipedia
+    <https://en.wikipedia.org/wiki/Norm_(mathematics)>`_
 
     """
-    isDocumentTermDF = _check_is_valid_DataFrame(input_matrix)
+    isDataFrame = _check_is_valid_DataFrame(input_matrix)
 
-    if isDocumentTermDF:
+    if isDataFrame:
         input_matrix_coo = input_matrix.sparse.to_coo()
         input_matrix_for_vectorization = input_matrix_coo.astype("float64")
     else:
@@ -943,7 +973,7 @@ def normalize(input_matrix: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Ser
         input_matrix_for_vectorization, norm=norm
     )  # Can handle sparse input.
 
-    if isDocumentTermDF:
+    if isDataFrame:
         return pd.DataFrame.sparse.from_spmatrix(
             result, input_matrix.index, input_matrix.columns
         )
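Before the next commit, note the sparse round trip that normalize (and the other DataFrame-aware functions) relies on. It can be reproduced standalone with only the calls that appear in the hunk above; the alias for sklearn's normalize is ours, a sketch rather than the module's actual import:

    import pandas as pd
    from sklearn.preprocessing import normalize as sklearn_normalize

    df = pd.DataFrame(
        [[1, 2, 3, 4], [4, 2, 7, 5]], columns=["a", "b", "c", "d"]
    ).astype("Sparse")

    # Sparse DataFrame -> scipy COO matrix -> sklearn, which can
    # process the sparse matrix without densifying it...
    coo = df.sparse.to_coo().astype("float64")
    result = sklearn_normalize(coo, norm="max")

    # ...and back to a sparse DataFrame, index and columns preserved.
    out = pd.DataFrame.sparse.from_spmatrix(result, df.index, df.columns)
    print(out)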
From 111ced6ebfc3fe0ffd4c24725c45f36f2eb6d394 Mon Sep 17 00:00:00 2001
From: Maximilian Krahn
Date: Wed, 9 Sep 2020 22:46:38 +0200
Subject: [PATCH 19/23] uniform docstring

---
 texthero/representation.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/texthero/representation.py b/texthero/representation.py
index f222e40c..b8a00541 100644
--- a/texthero/representation.py
+++ b/texthero/representation.py
@@ -392,7 +392,7 @@ def nmf(
     input_matrix: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None
 ) -> pd.Series:
     """
-    Performs non-negative matrix factorization.
+    Performs non-negative matrix factorization on the given input.
 
     Non-Negative Matrix Factorization (NMF) is often used in
     natural language processing to find clusters of similar
@@ -477,7 +477,8 @@ def tsne(
     n_jobs=-1,
 ) -> pd.Series:
     """
-    Performs TSNE on the given pandas series.
+    Performs t-Distributed Stochastic Neighbor Embedding on the given
+    input.
 
     t-distributed Stochastic Neighbor Embedding (t-SNE) is
     a machine learning algorithm used to visualize high-dimensional data
@@ -589,7 +590,7 @@ def kmeans(
     algorithm="auto",
 ):
     """
-    Performs K-means clustering algorithm.
+    Performs K-means clustering algorithm on the given input.
 
     K-means clustering is used in natural language processing
     to separate texts into k clusters (groups)
@@ -695,7 +696,7 @@ def dbscan(
     n_jobs=-1,
 ):
     """
-    Perform DBSCAN clustering.
+    Perform DBSCAN clustering on the given input.
 
     Density-based spatial clustering of applications with noise (DBSCAN)
     is used in natural language processing
@@ -810,7 +811,7 @@ def meanshift(
     max_iter=300,
 ):
     """
-    Perform mean shift clustering.
+    Perform mean shift clustering on the given input.
 
     Mean shift clustering is used in
     natural language processing
From b3823e256c1b86267a2183ca8f13560150e8358d Mon Sep 17 00:00:00 2001
From: Maximilian Krahn
Date: Wed, 9 Sep 2020 22:47:29 +0200
Subject: [PATCH 20/23] formatting done

---
 texthero/representation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/texthero/representation.py b/texthero/representation.py
index b8a00541..7e9fdcdd 100644
--- a/texthero/representation.py
+++ b/texthero/representation.py
@@ -40,7 +40,7 @@ def _check_is_valid_DataFrame(df: Union[pd.DataFrame, pd.Series]) -> bool:
 
 # Warning message for not-tokenized inputs
 _not_tokenized_warning_message = (
-    "It seems like the given Pandas Series s is not tokenized. This"
+    "It seems like the given Pandas Series s is not tokenized. This"
     " function will tokenize it automatically using hero.tokenize(s)"
     " first. You should consider tokenizing it yourself first with"
     " hero.tokenize(s) in the future."
From efcf8c04e64d9361d59d0069dcd5bb850b833604 Mon Sep 17 00:00:00 2001
From: Henri Froese
Date: Sat, 12 Sep 2020 10:58:14 +0200
Subject: [PATCH 21/23] Fix small stuff from review.

---
 texthero/representation.py | 32 +++++++++++---------------------
 1 file changed, 11 insertions(+), 21 deletions(-)

diff --git a/texthero/representation.py b/texthero/representation.py
index 7e9fdcdd..0a59d05f 100644
--- a/texthero/representation.py
+++ b/texthero/representation.py
@@ -28,16 +28,6 @@
 """
 
 
-def _check_is_valid_DataFrame(df: Union[pd.DataFrame, pd.Series]) -> bool:
-    """
-    Check if the given Pandas Series is a DataFrame without Multicolumns.
-
-    Returns true if input is a DataFrame without Multicolumns, else False.
-
-    """
-    return isinstance(df, pd.DataFrame) and not isinstance(df.columns, pd.MultiIndex)
-
-
 # Warning message for not-tokenized inputs
 _not_tokenized_warning_message = (
     "It seems like the given Pandas Series s is not tokenized. This"
@@ -64,7 +54,7 @@ def count(
 
     Rows of the returned DataFrame represent document whereas
     columns are terms. The value in the cell document-term is
-    the number of the term inthis document. The output is sparse.
+    the number of the term in this document. The output is sparse.
     TODO add tutorial link
 
     The input Series should already be tokenized. If not, it will
@@ -109,7 +99,7 @@ def count(
 
     See Also
     --------
-    DataFrame: TODO add tutorial link
+    TODO add tutorial link
     """
     # TODO. Can be rewritten without sklearn.
@@ -183,7 +173,7 @@ def term_frequency(
 
     See Also
     --------
-    DataFrame: TODO add tutorial link
+    TODO add tutorial link
     """
     # Check if input is tokenized. Else, print warning and tokenize.
     if not isinstance(s.iloc[0], list):
@@ -281,7 +271,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram
     --------
     `TF-IDF on Wikipedia <https://en.wikipedia.org/wiki/Tf-idf>`_
 
-    DataFrame: TODO add tutorial link
+    TODO add tutorial link
     """
     # Check if input is tokenized. Else, print warning and tokenize.
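The review hunks that follow all inline the same dispatch in place of the deleted `_check_is_valid_DataFrame` helper: a plain `isinstance` check, sparse conversion for DataFrames, list expansion for VectorSeries. As a standalone sketch of that idiom (the helper name `_prepare_input` is ours, not in the patch; `meanshift` is the one exception below, taking dense `.values` because sklearn's MeanShift has no sparse path):

    import pandas as pd

    def _prepare_input(input_matrix):
        # Sparse DataFrame: hand sklearn a scipy COO matrix so the
        # sparseness survives vectorization.
        if isinstance(input_matrix, pd.DataFrame):
            return input_matrix.sparse.to_coo().astype("float64")
        # VectorSeries: expand into a list of fixed-length vectors.
        return list(input_matrix)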
@@ -380,7 +370,7 @@ def pca(
     """
     pca = PCA(n_components=n_components, random_state=random_state, copy=False)
 
-    if _check_is_valid_DataFrame(input_matrix):
+    if isinstance(input_matrix, pd.DataFrame):
         values = input_matrix.values
     else:
         values = list(input_matrix)
@@ -455,7 +445,7 @@ def nmf(
     """
     nmf = NMF(n_components=n_components, init="random", random_state=random_state,)
 
-    if _check_is_valid_DataFrame(input_matrix):
+    if isinstance(input_matrix, pd.DataFrame):
         input_matrix_coo = input_matrix.sparse.to_coo()
         input_matrix_for_vectorization = input_matrix_coo.astype("float64")
     else:
@@ -564,7 +554,7 @@ def tsne(
         n_jobs=n_jobs,
     )
 
-    if _check_is_valid_DataFrame(input_matrix):
+    if isinstance(input_matrix, pd.DataFrame):
         input_matrix_coo = input_matrix.sparse.to_coo()
         input_matrix_for_vectorization = input_matrix_coo.astype("float64")
     else:
@@ -667,7 +657,7 @@ def kmeans(
 
     """
 
-    if _check_is_valid_DataFrame(input_matrix):
+    if isinstance(input_matrix, pd.DataFrame):
         input_matrix_coo = input_matrix.sparse.to_coo()
         input_matrix_for_vectorization = input_matrix_coo.astype("float64")
     else:
@@ -782,7 +772,7 @@ def dbscan(
 
     """
 
-    if _check_is_valid_DataFrame(input_matrix):
+    if isinstance(input_matrix, pd.DataFrame):
         input_matrix_coo = input_matrix.sparse.to_coo()
         input_matrix_for_vectorization = input_matrix_coo.astype("float64")
     else:
@@ -896,7 +886,7 @@ def meanshift(
 
     """
 
-    if _check_is_valid_DataFrame(input_matrix):
+    if isinstance(input_matrix, pd.DataFrame):
         vectors = input_matrix.values
     else:
         vectors = list(input_matrix)
@@ -962,7 +952,7 @@ def normalize(input_matrix: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Ser
     <https://en.wikipedia.org/wiki/Norm_(mathematics)>`_
 
     """
-    isDataFrame = _check_is_valid_DataFrame(input_matrix)
+    isDataFrame = isinstance(input_matrix, pd.DataFrame)
 
     if isDataFrame:
         input_matrix_coo = input_matrix.sparse.to_coo()
From 6e0c8315c318ae77d4549ccff9ebf4d0aed3708f Mon Sep 17 00:00:00 2001
From: Henri Froese
Date: Sat, 12 Sep 2020 12:08:27 +0200
Subject: [PATCH 22/23] incorporate suggested changes

---
 texthero/representation.py | 42 +++++++++++++-----------------------
 1 file changed, 17 insertions(+), 25 deletions(-)

diff --git a/texthero/representation.py b/texthero/representation.py
index 0a59d05f..a999836c 100644
--- a/texthero/representation.py
+++ b/texthero/representation.py
@@ -52,7 +52,7 @@ def count(
     """
     Represent a text-based Pandas Series using count.
 
-    Rows of the returned DataFrame represent document whereas
+    Rows of the returned DataFrame represent documents whereas
     columns are terms. The value in the cell document-term is
     the number of the term in this document. The output is sparse.
     TODO add tutorial link
@@ -64,7 +64,7 @@ def count(
     ----------
     s : Pandas Series (tokenized)
 
-    max_features : int, optional, default to None.
+    max_features : int, optional, default=None.
         Maximum number of features to keep. Will keep all features if set
         to None.
@@ -88,13 +88,11 @@ def count(
     --------
     >>> import texthero as hero
     >>> import pandas as pd
-    >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(
-    ...     hero.tokenize
-    ... )
-    >>> hero.count(s) # doctest: +SKIP
-       Sentence  one  two
-    0         1    1    0
-    1         1    0    1
+    >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize)
+    >>> hero.count(s)
+       Sentence  one  two
+    0         1    1    0
+    1         1    0    1
 
     See Also
     --------
@@ -130,7 +128,7 @@ def term_frequency(
     """
     Represent a text-based Pandas Series using Term Frequency.
 
-    Rows of the returned DataFrame represent document whereas columns are
+    Rows of the returned DataFrame represent documents whereas columns are
     terms. The value in the cell document-term is the frequency of the
     term in this document. The output is sparse.
     TODO add tutorial link
@@ -142,7 +140,7 @@ def term_frequency(
     ----------
     s : Pandas Series (tokenized)
 
-    max_features : int, optional, default to None.
+    max_features : int, optional, default=None.
         Maximum number of features to keep. Will keep all features if set
         to None.
@@ -163,9 +161,7 @@ def term_frequency(
     --------
     >>> import texthero as hero
     >>> import pandas as pd
-    >>> s = pd.Series(["Sentence one hey", "Sentence two"]).pipe(
-    ...     hero.tokenize
-    ... )
+    >>> s = pd.Series(["Sentence one hey", "Sentence two"]).pipe(hero.tokenize)
     >>> hero.term_frequency(s) # doctest: +SKIP
        Sentence  hey  one  two
     0       0.2  0.2  0.2  0.0
@@ -203,7 +199,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram
     """
     Represent a text-based Pandas Series using TF-IDF.
 
-    Rows of the returned DataFrame represent document whereas columns are
+    Rows of the returned DataFrame represent documents whereas columns are
     terms. The value in the cell document-term is the tfidf-value of the
     term in this document. The output is sparse.
 
@@ -229,10 +225,6 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram
     normalize the output in any way, so the result is exactly what you
     get applying the formula described above.
 
-    Return a Document Term DataFrame with the
-    tfidf of every word in the document. The output is sparse.
-    TODO add tutorial link
-
     The input Series should already be tokenized. If not, it will
     be tokenized before tfidf is calculated.
 
@@ -240,7 +232,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram
     ----------
     s : Pandas Series (tokenized)
 
-    max_features : int, optional, default to None.
+    max_features : int, optional, default=None.
         If not None, only the max_features most frequent tokens are used.
 
     min_df : float in range [0.0, 1.0] or int, default=1
@@ -333,7 +325,7 @@ def pca(
     ----------
     input_matrix : Pandas Series (VectorSeries) or DataFrame
 
-    n_components : Int. Default is 2.
+    n_components : int, default=2.
         Number of components to keep (dimensionality of output vectors).
         If n_components is not set or None, all components are kept.
 
@@ -405,7 +397,7 @@ def nmf(
     ----------
     input_matrix : Pandas Series (VectorSeries) or DataFrame
 
-    n_components : Int. Default is 2.
+    n_components : int, default=2.
         Number of components to keep (dimensionality of output vectors).
         If n_components is not set or None, all components are kept.
 
@@ -489,7 +481,7 @@ def tsne(
     ----------
     input_matrix : Pandas Series (VectorSeries) or DataFrame
 
-    n_components : int, default is 2.
+    n_components : int, default=2.
         Number of components to keep (dimensionality of output vectors).
         If n_components is not set or None, all components are kept.
 
@@ -603,7 +595,7 @@ def kmeans(
     ----------
     input_matrix: Pandas Series (VectorSeries) or DataFrame
 
-    n_clusters: Int, default to 5.
+    n_clusters: int, default=5.
         The number of clusters to separate the data into.
 
     n_init : int, default=10
@@ -926,7 +918,7 @@ def normalize(input_matrix: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Ser
     ----------
     input_matrix: Pandas Series (VectorSeries) or DataFrame
 
-    norm: str, default to "l2"
+    norm: str, default="l2"
         One of "l1", "l2", or "max". The norm that is used.
 
     Examples
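Since the hunk above trims the tfidf docstring down to its formula reference, here is the referenced formula as a worked check. This assumes sklearn's smooth-idf convention, which the vectorizer wraps without further normalization; the numbers are ours:

    import math

    # tfidf(t, d) = tf(t, d) * (ln((1 + n) / (1 + df(t))) + 1)
    def tfidf_value(tf, n_documents, df):
        return tf * (math.log((1 + n_documents) / (1 + df)) + 1)

    # A term appearing twice in one document and present in both of
    # the two corpus documents: the idf factor collapses to 1.
    print(tfidf_value(tf=2, n_documents=2, df=2))  # 2.0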
From 3f8b734ca256819fb7318cdf8fc2afb99fc0dcb8 Mon Sep 17 00:00:00 2001
From: Henri Froese
Date: Sat, 12 Sep 2020 12:17:44 +0200
Subject: [PATCH 23/23] re-skip doctest as it fails on macOS

---
 texthero/representation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/texthero/representation.py b/texthero/representation.py
index a999836c..5d610134 100644
--- a/texthero/representation.py
+++ b/texthero/representation.py
@@ -89,7 +89,7 @@ def count(
     >>> import texthero as hero
     >>> import pandas as pd
     >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize)
-    >>> hero.count(s)
+    >>> hero.count(s) # doctest: +SKIP
        Sentence  one  two
     0         1    1    0
     1         1    0    1
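With the series applied, the vectorizers return sparse document-term DataFrames and the downstream functions accept them directly. A closing sketch of the intended end-to-end flow, written against the patched API as documented in the hunks above rather than against a released texthero version:

    import pandas as pd
    import texthero as hero

    s = pd.Series(["Sentence one", "Sentence two", "Sentence one two"])
    s = s.pipe(hero.clean).pipe(hero.tokenize)

    df = hero.tfidf(s)                  # sparse document-term DataFrame
    df = hero.normalize(df, norm="l2")  # sparseness is kept
    labels = hero.kmeans(df, n_clusters=2, random_state=42)
    print(labels)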