From fa342a92d4f007cebfce29f1f22a4e31fedc56c6 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn <maximilian.krahn@icloud.com> Date: Tue, 18 Aug 2020 22:06:14 +0200 Subject: [PATCH 01/22] added MultiIndex DF support suport MultiIndex as function parameter returns MultiIndex, where Representation was returned * missing: correct test Co-authored-by: Henri Froese <hf2000510@gmail.com> --- tests/test_indexes.py | 18 +-- tests/test_representation.py | 63 +------- texthero/representation.py | 294 +++++++++++++---------------------- texthero/visualization.py | 4 +- 4 files changed, 115 insertions(+), 264 deletions(-) diff --git a/tests/test_indexes.py b/tests/test_indexes.py index cc041c3a..af7afcd2 100644 --- a/tests/test_indexes.py +++ b/tests/test_indexes.py @@ -56,21 +56,9 @@ ] test_cases_representation = [ - [ - "count", - lambda x: representation.flatten(representation.count(x)), - (s_tokenized_lists,), - ], - [ - "term_frequency", - lambda x: representation.flatten(representation.term_frequency(x)), - (s_tokenized_lists,), - ], - [ - "tfidf", - lambda x: representation.flatten(representation.tfidf(x)), - (s_tokenized_lists,), - ], + ["count", representation.count, (s_tokenized_lists,),], + ["term_frequency", representation.term_frequency, (s_tokenized_lists,),], + ["tfidf", representation.tfidf, (s_tokenized_lists,),], ["pca", representation.pca, (s_numeric_lists, 0)], ["nmf", representation.nmf, (s_numeric_lists,)], ["tsne", representation.tsne, (s_numeric_lists,)], diff --git a/tests/test_representation.py b/tests/test_representation.py index 036775af..41b81ffa 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -50,16 +50,9 @@ def _tfidf(term, corpus, document_index): [["Test", "Test", "TEST", "!"], ["Test", "?", ".", "."]], index=[5, 7] ) -s_tokenized_output_index = pd.MultiIndex.from_tuples( - [(0, "!"), (0, "TEST"), (0, "Test"), (1, "."), (1, "?"), (1, "Test")], -) - -s_tokenized_output_noncontinuous_index = pd.MultiIndex.from_tuples( - [(5, "!"), (5, "TEST"), (5, "Test"), (7, "."), (7, "?"), (7, "Test")], -) - -s_tokenized_output_min_df_index = pd.MultiIndex.from_tuples([(0, "Test"), (1, "Test")],) +s_tokenized_output_index = [0,1] +s_tokenized_output_index_noncontinous = [5,7] test_cases_vectorization = [ # format: [function_name, function, correct output for tokenized input above, dtype of output] @@ -182,55 +175,3 @@ def test_tfidf_formula(self): ).astype("Sparse") self.assertEqual(representation.tfidf(s), s_true) - - """ - flatten. - """ - - def test_flatten(self): - index = pd.MultiIndex.from_tuples( - [("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")], - ) - s = pd.Series([3, np.nan, 4], index=index) - - s_true = pd.Series( - [[3.0, 0.0, np.nan], [0.0, 4.0, 0.0]], index=["doc0", "doc1"], - ) - - pd.testing.assert_series_equal( - representation.flatten(s), s_true, check_names=False - ) - - def test_flatten_fill_missing_with(self): - index = pd.MultiIndex.from_tuples( - [("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")], - ) - s = pd.Series([3, np.nan, 4], index=index) - - s_true = pd.Series( - [[3.0, "FILLED", np.nan], ["FILLED", 4.0, "FILLED"]], - index=["doc0", "doc1"], - ) - - pd.testing.assert_series_equal( - representation.flatten(s, fill_missing_with="FILLED"), - s_true, - check_names=False, - ) - - def test_flatten_missing_row(self): - # Simulating a row with no features, so it's completely missing from - # the representation series. 
- index = pd.MultiIndex.from_tuples( - [("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")], - ) - s = pd.Series([3, np.nan, 4], index=index) - - s_true = pd.Series( - [[3.0, 0.0, np.nan], [0.0, 4.0, 0.0], [0.0, 0.0, 0.0]], - index=["doc0", "doc1", "doc2"], - ) - - pd.testing.assert_series_equal( - representation.flatten(s, index=s_true.index), s_true, check_names=False - ) diff --git a/texthero/representation.py b/texthero/representation.py index 07b7706c..042db71a 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -27,90 +27,14 @@ """ -def flatten( - s: Union[pd.Series, pd.Series.sparse], - index: pd.Index = None, - fill_missing_with: Any = 0.0, -) -> pd.Series: - """ - Transform a Pandas Representation Series to a "normal" (flattened) Pandas Series. - - The given Series should have a multiindex with first level being the document - and second level being individual features of that document (e.g. tdidf scores per word). - The flattened Series has one cell per document, with the cell being a list of all - the individual features of that document. - - Parameters - ---------- - s : Sparse Pandas Series or Pandas Series - The multiindexed Pandas Series to flatten. - - index : Pandas Index, optional, default to None - The index the flattened Series should have. - - fill_missing_with : Any, default to 0.0 - Value to fill the NaNs (missing values) with. This _does not_ mean - that existing values that are np.nan are replaced, but rather that - features that are not present in one document but present in others - are filled with fill_missing_with. See example below. - - - Examples - -------- - >>> import texthero as hero - >>> import pandas as pd - >>> import numpy as np - >>> index = pd.MultiIndex.from_tuples([("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")], names=['document', 'word']) - >>> s = pd.Series([3, np.nan, 4], index=index) - >>> s - document word - doc0 Word1 3.0 - Word3 NaN - doc1 Word2 4.0 - dtype: float64 - >>> hero.flatten(s, fill_missing_with=0.0) - document - doc0 [3.0, 0.0, nan] - doc1 [0.0, 4.0, 0.0] - dtype: object - - """ - s = s.unstack(fill_value=fill_missing_with) - - if index is not None: - s = s.reindex(index, fill_value=fill_missing_with) - # Reindexing makes the documents for which no values - # are present in the Sparse Representation Series - # "reappear" correctly. - - s = pd.Series(s.values.tolist(), index=s.index) - - return s - - -def _check_is_valid_representation(s: pd.Series) -> bool: +def _check_is_valid_DocumentTermDF(df: Union[pd.DataFrame, pd.Series]) -> bool: """ - Check if the given Pandas Series is a Document Representation Series. + Check if the given Pandas Series is a Document Term DF. - Returns true if Series is Document Representation Series, else False. + Returns true if input is Document Term DF, else False. """ - - # TODO: in Version 2 when only representation is accepted as input -> change "return False" to "raise ValueError" - - if not isinstance(s.index, pd.MultiIndex): - return False - # raise ValueError( - # f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex. The given Pandas Series does not appears to have MultiIndex" - # ) - - if s.index.nlevels != 2: - return False - # raise ValueError( - # f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represent the document and the second one the words/token. The given Pandas Series has {s.index.nlevels} number of levels instead of 2." 
- # ) - - return True + return isinstance(df, pd.DataFrame) and isinstance(df.columns, pd.MultiIndex) # Warning message for not-tokenized inputs @@ -132,11 +56,11 @@ def count( min_df=1, max_df=1.0, binary=False, -) -> pd.Series: +) -> pd.DataFrame: """ Represent a text-based Pandas Series using count. - Return a Document Representation Series with the + Return a Document Term DataFrame with the number of occurences of a document's words for every document. TODO add tutorial link @@ -144,10 +68,6 @@ def count( The input Series should already be tokenized. If not, it will be tokenized before count is calculated. - Use :meth:`hero.representation.flatten` on the output to get - a standard Pandas Series with the document vectors - in every cell. - Parameters ---------- s : Pandas Series (tokenized) @@ -177,15 +97,14 @@ def count( >>> import pandas as pd >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize) >>> hero.count(s) - 0 Sentence 1 - one 1 - 1 Sentence 1 - two 1 - dtype: Sparse[int64, 0] + count + Sentence one two + 0 1 1 0 + 1 1 0 1 See Also -------- - Document Representation Series: TODO add tutorial link + Document Term DataFrame: TODO add tutorial link """ # TODO. Can be rewritten without sklearn. @@ -204,25 +123,23 @@ def count( ) tf_vectors_csr = tf.fit_transform(s) - tf_vectors_coo = coo_matrix(tf_vectors_csr) - s_out = pd.Series.sparse.from_coo(tf_vectors_coo) - - features_names = tf.get_feature_names() - - # Map word index to word name - s_out.index = s_out.index.map(lambda x: (s.index[x[0]], features_names[x[1]])) + multiindexed_columns = pd.MultiIndex.from_tuples( + [("count", word) for word in tf.get_feature_names()] + ) - return s_out + return pd.DataFrame.sparse.from_spmatrix( + tf_vectors_csr, s.index, multiindexed_columns + ) def term_frequency( s: pd.Series, max_features: Optional[int] = None, min_df=1, max_df=1.0, -) -> pd.Series: +) -> pd.DataFrame: """ Represent a text-based Pandas Series using term frequency. - Return a Document Representation Series with the + Return a Document Term DataFrame with the term frequencies of the terms for every document. TODO add tutorial link @@ -230,11 +147,6 @@ def term_frequency( The input Series should already be tokenized. If not, it will be tokenized before term_frequency is calculated. - Use :meth:`hero.representation.flatten` on the output to get - a standard Pandas Series with the document vectors - in every cell. - - Parameters ---------- s : Pandas Series (tokenized) @@ -261,16 +173,14 @@ def term_frequency( >>> import pandas as pd >>> s = pd.Series(["Sentence one hey", "Sentence two"]).pipe(hero.tokenize) >>> hero.term_frequency(s) - 0 Sentence 0.2 - hey 0.2 - one 0.2 - 1 Sentence 0.2 - two 0.2 - dtype: Sparse[float64, nan] + term_frequency + Sentence hey one two + 0 0.2 0.2 0.2 0.0 + 1 0.2 0.0 0.0 0.2 See Also -------- - Document Representation Series: TODO add tutorial link + Document Term DataFrame: TODO add tutorial link """ # Check if input is tokenized. Else, print warning and tokenize. 
if not isinstance(s.iloc[0], list): @@ -291,17 +201,16 @@ def term_frequency( total_count_coo = np.sum(tf_vectors_coo) frequency_coo = np.divide(tf_vectors_coo, total_count_coo) - s_out = pd.Series.sparse.from_coo(frequency_coo) - - features_names = tf.get_feature_names() - - # Map word index to word name - s_out.index = s_out.index.map(lambda x: (s.index[x[0]], features_names[x[1]])) + multiindexed_columns = pd.MultiIndex.from_tuples( + [("term_frequency", word) for word in tf.get_feature_names()] + ) - return s_out + return pd.DataFrame.sparse.from_spmatrix( + frequency_coo, s.index, multiindexed_columns + ) -def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: +def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFrame: """ Represent a text-based Pandas Series using TF-IDF. @@ -324,20 +233,13 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: so the result is exactly what you get applying the formula described above. - Return a Document Representation Series with the + Return a Document Term DataFrame with the tfidf of every word in the document. TODO add tutorial link The input Series should already be tokenized. If not, it will be tokenized before tfidf is calculated. - If working with big pandas Series, you might want to limit - the number of features through the max_features parameter. - - Use :meth:`hero.representation.flatten` on the output to get - a standard Pandas Series with the document vectors - in every cell. - Parameters ---------- s : Pandas Series (tokenized) @@ -365,17 +267,16 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: >>> import pandas as pd >>> s = pd.Series(["Hi Bye", "Test Bye Bye"]).pipe(hero.tokenize) >>> hero.tfidf(s) - 0 Bye 1.000000 - Hi 1.405465 - 1 Bye 2.000000 - Test 1.405465 - dtype: Sparse[float64, nan] + tfidf + Bye Hi Test + 0 1.0 1.405465 0.000000 + 1 2.0 0.000000 1.405465 See Also -------- `TF-IDF on Wikipedia <https://en.wikipedia.org/wiki/Tf-idf>`_ - Document Representation Series: TODO add tutorial link + Document Term DataFrame: TODO add tutorial link """ # Check if input is tokenized. Else, print warning and tokenize. @@ -395,16 +296,13 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: tfidf_vectors_csr = tfidf.fit_transform(s) - # Result from sklearn is in Compressed Sparse Row format. - # Pandas Sparse Series can only be initialized from Coordinate format. - tfidf_vectors_coo = coo_matrix(tfidf_vectors_csr) - s_out = pd.Series.sparse.from_coo(tfidf_vectors_coo) - - # Map word index to word name and keep original index of documents. - feature_names = tfidf.get_feature_names() - s_out.index = s_out.index.map(lambda x: (s.index[x[0]], feature_names[x[1]])) + multiindexed_columns = pd.MultiIndex.from_tuples( + [("tfidf", word) for word in tfidf.get_feature_names()] + ) - return s_out + return pd.DataFrame.sparse.from_spmatrix( + tfidf_vectors_csr, s.index, multiindexed_columns + ) """ @@ -412,7 +310,9 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: """ -def pca(s, n_components=2, random_state=None) -> pd.Series: +def pca( + s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None +) -> pd.Series: """ Perform principal component analysis on the given Pandas Series. 
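To make the dispatch used in the following hunks concrete, here is a minimal standalone sketch of what the new `_check_is_valid_DocumentTermDF` helper accepts and rejects. It is not part of the patch; it assumes the patched `texthero.representation` module is importable and the data values are illustrative.

import pandas as pd

from texthero.representation import _check_is_valid_DocumentTermDF

# A DocumentTermDF: a DataFrame whose columns are a MultiIndex
# (first level: representation name, second level: term).
df = pd.DataFrame(
    [[1, 0], [0, 2]],
    columns=pd.MultiIndex.from_tuples([("count", "hi"), ("count", "bye")]),
)

# A VectorSeries: every cell is a list of numbers.
s = pd.Series([[1.0, 0.0], [0.0, 2.0]])

assert _check_is_valid_DocumentTermDF(df)     # DataFrame with MultiIndex columns
assert not _check_is_valid_DocumentTermDF(s)  # a plain Series is never a DocumentTermDF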
@@ -434,7 +334,7 @@ def pca(s, n_components=2, random_state=None) -> pd.Series: Parameters ---------- - s : Pandas Series + s : Pandas Series or MuliIndex Sparse DataFrame n_components : Int. Default is 2. Number of components to keep (dimensionality of output vectors). @@ -468,10 +368,18 @@ def pca(s, n_components=2, random_state=None) -> pd.Series: """ pca = PCA(n_components=n_components, random_state=random_state, copy=False) - return pd.Series(pca.fit_transform(list(s)).tolist(), index=s.index) + + if _check_is_valid_DocumentTermDF(s): + values = s.values + else: + values = list(s) + + return pd.Series(pca.fit_transform(values).tolist(), index=s.index) -def nmf(s, n_components=2, random_state=None) -> pd.Series: +def nmf( + s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None +) -> pd.Series: """ Performs non-negative matrix factorization. @@ -491,7 +399,7 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series: Parameters ---------- - s : Pandas Series + s : Pandas Series or Pandas MultiIndex Sparse DataFrame n_components : Int. Default is 2. Number of components to keep (dimensionality of output vectors). @@ -527,11 +435,17 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series: """ nmf = NMF(n_components=n_components, init="random", random_state=random_state,) - return pd.Series(nmf.fit_transform(list(s)).tolist(), index=s.index) + + if _check_is_valid_DocumentTermDF(s): + values = s.sparse.to_coo() + else: + values = list(s) + + return pd.Series(nmf.fit_transform(values).tolist(), index=s.index) def tsne( - s: pd.Series, + s: Union[pd.Series, pd.DataFrame], n_components=2, perplexity=30.0, learning_rate=200.0, @@ -557,7 +471,7 @@ def tsne( Parameters ---------- - s : Pandas Series + s : Pandas Series or Pandas MultiIndex Sparse DataFrame n_components : int, default is 2. Number of components to keep (dimensionality of output vectors). @@ -619,7 +533,13 @@ def tsne( random_state=random_state, n_jobs=n_jobs, ) - return pd.Series(tsne.fit_transform(list(s)).tolist(), index=s.index) + + if _check_is_valid_DocumentTermDF(s): + values = s.sparse.to_coo() + else: + values = list(s) + + return pd.Series(tsne.fit_transform(values).tolist(), index=s.index) """ @@ -628,7 +548,7 @@ def tsne( def kmeans( - s: pd.Series, + s: Union[pd.Series, pd.DataFrame], n_clusters=5, n_init=10, max_iter=300, @@ -653,7 +573,7 @@ def kmeans( Parameters ---------- - s: Pandas Series + s: Pandas Series or Pandas MultiIndex Sparse DataFrame n_clusters: Int, default to 5. The number of clusters to separate the data into. 
@@ -686,7 +606,7 @@ def kmeans( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, fun, guitar"]) - >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency).pipe(hero.flatten) # TODO: when others get Representation Support: remove flatten + >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency) >>> hero.kmeans(s, n_clusters=2, random_state=42) 0 1 1 0 @@ -702,7 +622,12 @@ def kmeans( `kmeans on Wikipedia <https://en.wikipedia.org/wiki/K-means_clustering>`_ """ - vectors = list(s) + + if _check_is_valid_DocumentTermDF(s): + vectors = s.sparse.to_coo() + else: + vectors = list(s) + kmeans = KMeans( n_clusters=n_clusters, n_init=n_init, @@ -715,7 +640,7 @@ def kmeans( def dbscan( - s, + s: Union[pd.Series, pd.DataFrame], eps=0.5, min_samples=5, metric="euclidean", @@ -743,7 +668,7 @@ def dbscan( Parameters ---------- - s: Pandas Series + s: Pandas Series or Pandas MultiIndex Sparse DataFrame eps : float, default=0.5 The maximum distance between two samples for one to be considered @@ -783,7 +708,7 @@ def dbscan( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, enjoy, guitar"]) - >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf).pipe(hero.flatten) # TODO: when others get Representation Support: remove flatten + >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) >>> hero.dbscan(s, min_samples=1, eps=4) 0 0 1 1 @@ -801,6 +726,11 @@ def dbscan( """ + if _check_is_valid_DocumentTermDF(s): + vectors = s.sparse.to_coo() + else: + vectors = list(s) + return pd.Series( DBSCAN( eps=eps, @@ -809,13 +739,13 @@ def dbscan( metric_params=metric_params, leaf_size=leaf_size, n_jobs=n_jobs, - ).fit_predict(list(s)), + ).fit_predict(vectors), index=s.index, ).astype("category") def meanshift( - s, + s: Union[pd.Series, pd.DataFrame], bandwidth=None, bin_seeding=False, min_bin_freq=1, @@ -843,7 +773,7 @@ def meanshift( Parameters ---------- - s: Pandas Series + s: Pandas Series or Pandas MultiIndex Sparse DataFrame bandwidth : float, default=None Bandwidth used in the RBF kernel. @@ -901,6 +831,11 @@ def meanshift( """ + if _check_is_valid_DocumentTermDF(s): + vectors = s.values + else: + vectors = list(s) + return pd.Series( MeanShift( bandwidth=bandwidth, @@ -909,7 +844,7 @@ def meanshift( cluster_all=cluster_all, n_jobs=n_jobs, max_iter=max_iter, - ).fit_predict(list(s)), + ).fit_predict(vectors), index=s.index, ).astype("category") @@ -962,31 +897,18 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: `Norm on Wikipedia <https://en.wikipedia.org/wiki/Norm_(mathematics)>`_ """ + isDocumentTermDF = _check_is_valid_DocumentTermDF(s) - is_valid_representation = ( - isinstance(s.index, pd.MultiIndex) and s.index.nlevels == 2 - ) - - if not is_valid_representation: - raise TypeError( - "The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex. 
The given Pandas Series does not appears to have MultiIndex" - ) - # TODO after merging representation: use _check_is_valid_representation instead - - if pd.api.types.is_sparse(s): - s_coo_matrix = s.sparse.to_coo()[0] + if isDocumentTermDF: + s_for_vectorization = s.sparse.to_coo() else: - s = s.astype("Sparse") - s_coo_matrix = s.sparse.to_coo()[0] - - s_for_vectorization = s_coo_matrix + s_for_vectorization = list(s) result = sklearn_normalize( s_for_vectorization, norm=norm ) # Can handle sparse input. - result_coo = coo_matrix(result) - s_result = pd.Series.sparse.from_coo(result_coo) - s_result.index = s.index - - return s_result + if isDocumentTermDF: + return pd.DataFrame.sparse.from_spmatrix(result, s.index, s.columns) + else: + return pd.Series(result.tolist(), index=s.index) diff --git a/texthero/visualization.py b/texthero/visualization.py index e213285e..2426ab4d 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -63,8 +63,8 @@ def scatterplot( >>> import pandas as pd >>> df = pd.DataFrame(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, fun, guitar"], columns=["texts"]) >>> df["texts"] = hero.clean(df["texts"]).pipe(hero.tokenize) - >>> df["pca"] = hero.tfidf(df["texts"]).pipe(hero.flatten).pipe(hero.pca, n_components=3) # TODO: when others get Representation Support: remove flatten - >>> df["topics"] = hero.tfidf(df["texts"]).pipe(hero.flatten).pipe(hero.kmeans, n_clusters=2) # TODO: when others get Representation Support: remove flatten + >>> df["pca"] = hero.tfidf(df["texts"]).pipe(hero.pca, n_components=3) + >>> df["topics"] = hero.tfidf(df["texts"]).pipe(hero.kmeans, n_clusters=2) >>> hero.scatterplot(df, col="pca", color="topics", hover_data=["texts"]) # doctest: +SKIP """ From 59a9f8c0df70d8136780b3160bc1d2ca59f48b26 Mon Sep 17 00:00:00 2001 From: Henri Froese <henri.froese@yahoo.com> Date: Wed, 19 Aug 2020 19:39:30 +0200 Subject: [PATCH 02/22] beginning with tests --- tests/test_representation.py | 147 +++++++++++++++++------------------ texthero/representation.py | 8 +- 2 files changed, 76 insertions(+), 79 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 41b81ffa..d4acd369 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -50,32 +50,84 @@ def _tfidf(term, corpus, document_index): [["Test", "Test", "TEST", "!"], ["Test", "?", ".", "."]], index=[5, 7] ) -s_tokenized_output_index = [0,1] +s_tokenized_output_index = [0, 1] + +s_tokenized_output_index_noncontinous = [5, 7] + + +def _get_multiindex_for_tokenized_output(first_level_name): + return pd.MultiIndex.from_product( + [[first_level_name], ["!", ".", "?", "TEST", "Test"]] + ) -s_tokenized_output_index_noncontinous = [5,7] test_cases_vectorization = [ - # format: [function_name, function, correct output for tokenized input above, dtype of output] - ["count", representation.count, [1, 1, 2, 2, 1, 1], "int"], + # format: [function_name, function, correct output for tokenized input above] + [ + "count", + representation.count, + pd.DataFrame( + [[1, 0, 0, 1, 2], [0, 2, 1, 0, 1]], + index=s_tokenized_output_index, + columns=_get_multiindex_for_tokenized_output("count"), + ).astype("Sparse"), + ], [ "term_frequency", representation.term_frequency, - [0.125, 0.125, 0.250, 0.250, 0.125, 0.125], - "float", + pd.DataFrame( + [[0.125, 0.0, 0.0, 0.125, 0.250], [0.0, 0.25, 0.125, 0.0, 0.125]], + index=s_tokenized_output_index, + columns=_get_multiindex_for_tokenized_output("term_frequency"), + 
).astype("Sparse"), ], [ "tfidf", representation.tfidf, - [_tfidf(x[1], s_tokenized, x[0]) for x in s_tokenized_output_index], - "float", + pd.DataFrame( + [ + [ + _tfidf(x, s_tokenized, 0) # Testing the tfidf formula here + for x in ["!", ".", "?", "TEST", "Test"] + ], + [_tfidf(x, s_tokenized, 0) for x in ["!", ".", "?", "TEST", "Test"]], + ], + index=s_tokenized_output_index, + columns=_get_multiindex_for_tokenized_output("tfidf"), + ).astype("Sparse"), ], ] + test_cases_vectorization_min_df = [ - # format: [function_name, function, correct output for tokenized input above, dtype of output] - ["count", representation.count, [2, 1], "int"], - ["term_frequency", representation.term_frequency, [0.666667, 0.333333], "float",], - ["tfidf", representation.tfidf, [2.0, 1.0], "float",], + # format: [function_name, function, correct output for tokenized input above] + [ + "count", + representation.count, + pd.DataFrame( + [2, 1], + index=s_tokenized_output_index, + columns=pd.MultiIndex.from_tuples([("count", "Test")]), + ).astype("Sparse"), + ], + [ + "term_frequency", + representation.term_frequency, + pd.DataFrame( + [0.666667, 0.333333], + index=s_tokenized_output_index, + columns=pd.MultiIndex.from_tuples([("term_frequency", "Test")]), + ).astype("Sparse"), + ], + [ + "tfidf", + representation.tfidf, + pd.DataFrame( + [2.0, 1.0], + index=s_tokenized_output_index, + columns=pd.MultiIndex.from_tuples([("tfidf", "Test")]), + ).astype("Sparse"), + ], ] @@ -91,62 +143,23 @@ class AbstractRepresentationTest(PandasTestCase): """ @parameterized.expand(test_cases_vectorization) - def test_vectorization_simple( - self, name, test_function, correct_output_values, int_or_float - ): - if int_or_float == "int": - s_true = pd.Series( - correct_output_values, index=s_tokenized_output_index, dtype="int" - ).astype(pd.SparseDtype(np.int64, 0)) - else: - s_true = pd.Series( - correct_output_values, index=s_tokenized_output_index, dtype="float" - ).astype(pd.SparseDtype("float", np.nan)) + def test_vectorization_simple(self, name, test_function, correct_output): + s_true = correct_output result_s = test_function(s_tokenized) - - pd.testing.assert_series_equal(s_true, result_s) + pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) @parameterized.expand(test_cases_vectorization) def test_vectorization_noncontinuous_index_kept( - self, name, test_function, correct_output_values, int_or_float + self, name, test_function, correct_output=None ): - if int_or_float == "int": - s_true = pd.Series( - correct_output_values, - index=s_tokenized_output_noncontinuous_index, - dtype="int", - ).astype(pd.SparseDtype(np.int64, 0)) - else: - s_true = pd.Series( - correct_output_values, - index=s_tokenized_output_noncontinuous_index, - dtype="float", - ).astype(pd.SparseDtype("float", np.nan)) - result_s = test_function(s_tokenized_with_noncontinuous_index) - - pd.testing.assert_series_equal(s_true, result_s) + pd.testing.assert_series_equal(s_tokenized_output_index_noncontinous, result_s) @parameterized.expand(test_cases_vectorization_min_df) - def test_vectorization_min_df( - self, name, test_function, correct_output_values, int_or_float - ): - if int_or_float == "int": - s_true = pd.Series( - correct_output_values, - index=s_tokenized_output_min_df_index, - dtype="int", - ).astype(pd.SparseDtype(np.int64, 0)) - else: - s_true = pd.Series( - correct_output_values, - index=s_tokenized_output_min_df_index, - dtype="float", - ).astype(pd.SparseDtype("float", np.nan)) - + def test_vectorization_min_df(self, 
name, test_function, correct_output): + s_true = correct_output result_s = test_function(s_tokenized, min_df=2) - - pd.testing.assert_series_equal(s_true, result_s) + pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) @parameterized.expand(test_cases_vectorization) def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args): @@ -159,19 +172,3 @@ def test_vectorization_arguments_to_sklearn(self, name, test_function, *args): test_function(s_not_tokenized, max_features=1, min_df=1, max_df=1.0) except TypeError: self.fail("Sklearn arguments not handled correctly.") - - """ - Individual / special tests. - """ - - def test_tfidf_formula(self): - s = pd.Series(["Hi Bye", "Test Bye Bye"]) - s = preprocessing.tokenize(s) - s_true_index = pd.MultiIndex.from_tuples( - [(0, "Bye"), (0, "Hi"), (1, "Bye"), (1, "Test")], - ) - s_true = pd.Series( - [_tfidf(x[1], s, x[0]) for x in s_true_index], index=s_true_index - ).astype("Sparse") - - self.assertEqual(representation.tfidf(s), s_true) diff --git a/texthero/representation.py b/texthero/representation.py index 042db71a..efabc9c6 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -97,11 +97,11 @@ def count( >>> import pandas as pd >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize) >>> hero.count(s) - count - Sentence one two + count + Sentence one two 0 1 1 0 1 1 0 1 - +# FIXME columns pandas doctest See Also -------- Document Term DataFrame: TODO add tutorial link @@ -375,7 +375,7 @@ def pca( values = list(s) return pd.Series(pca.fit_transform(values).tolist(), index=s.index) - +# FIXME: merge master again def nmf( s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None From 19c52de3f5ae6a1a01e4262dca00ea5177718311 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn <maximilian.krahn@icloud.com> Date: Wed, 19 Aug 2020 22:02:41 +0200 Subject: [PATCH 03/22] implemented correct sparse support *missing: test adopting for new types Co-authored-by: Henri Froese <hf2000510@gmail.com> --- tests/test_representation.py | 12 ++++---- texthero/representation.py | 59 +++++++++++++++++++++--------------- 2 files changed, 40 insertions(+), 31 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index d4acd369..7c02ccd2 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -70,7 +70,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): [[1, 0, 0, 1, 2], [0, 2, 1, 0, 1]], index=s_tokenized_output_index, columns=_get_multiindex_for_tokenized_output("count"), - ).astype("Sparse"), + ).astype("Sparse[int64, 0]"), ], [ "term_frequency", @@ -108,7 +108,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): [2, 1], index=s_tokenized_output_index, columns=pd.MultiIndex.from_tuples([("count", "Test")]), - ).astype("Sparse"), + ).astype("Sparse[int64, 0]"), ], [ "term_frequency", @@ -123,7 +123,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): "tfidf", representation.tfidf, pd.DataFrame( - [2.0, 1.0], + [2, 1], index=s_tokenized_output_index, columns=pd.MultiIndex.from_tuples([("tfidf", "Test")]), ).astype("Sparse"), @@ -146,20 +146,20 @@ class AbstractRepresentationTest(PandasTestCase): def test_vectorization_simple(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized) - pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) + pd.testing.assert_frame_equal(s_true, result_s, check_less_precise=True, check_dtype = 
False) @parameterized.expand(test_cases_vectorization) def test_vectorization_noncontinuous_index_kept( self, name, test_function, correct_output=None ): result_s = test_function(s_tokenized_with_noncontinuous_index) - pd.testing.assert_series_equal(s_tokenized_output_index_noncontinous, result_s) + pd.testing.assert_frame_equal(s_tokenized_output_index_noncontinous, result_s.index, check_dtype = False) @parameterized.expand(test_cases_vectorization_min_df) def test_vectorization_min_df(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized, min_df=2) - pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) + pd.testing.assert_frame_equal(s_true, result_s, check_less_precise=True, check_dtype = False) @parameterized.expand(test_cases_vectorization) def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args): diff --git a/texthero/representation.py b/texthero/representation.py index efabc9c6..ff691212 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -101,9 +101,12 @@ def count( Sentence one two 0 1 1 0 1 1 0 1 -# FIXME columns pandas doctest + See Also -------- + + # FIXME columns pandas doctest + Document Term DataFrame: TODO add tutorial link """ # TODO. Can be rewritten without sklearn. @@ -375,8 +378,11 @@ def pca( values = list(s) return pd.Series(pca.fit_transform(values).tolist(), index=s.index) + + # FIXME: merge master again + def nmf( s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None ) -> pd.Series: @@ -437,11 +443,12 @@ def nmf( nmf = NMF(n_components=n_components, init="random", random_state=random_state,) if _check_is_valid_DocumentTermDF(s): - values = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: - values = list(s) + s_for_vectorization = list(s) - return pd.Series(nmf.fit_transform(values).tolist(), index=s.index) + return pd.Series(nmf.fit_transform(s_for_vectorization).tolist(), index=s.index) def tsne( @@ -535,11 +542,12 @@ def tsne( ) if _check_is_valid_DocumentTermDF(s): - values = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: - values = list(s) + s_for_vectorization = list(s) - return pd.Series(tsne.fit_transform(values).tolist(), index=s.index) + return pd.Series(tsne.fit_transform(s_for_vectorization).tolist(), index=s.index) """ @@ -624,9 +632,10 @@ def kmeans( """ if _check_is_valid_DocumentTermDF(s): - vectors = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: - vectors = list(s) + s_for_vectorization = list(s) kmeans = KMeans( n_clusters=n_clusters, @@ -635,8 +644,8 @@ def kmeans( random_state=random_state, copy_x=True, algorithm=algorithm, - ).fit(vectors) - return pd.Series(kmeans.predict(vectors), index=s.index).astype("category") + ).fit(s_for_vectorization) + return pd.Series(kmeans.predict(s_for_vectorization), index=s.index).astype("category") def dbscan( @@ -727,9 +736,10 @@ def dbscan( """ if _check_is_valid_DocumentTermDF(s): - vectors = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: - vectors = list(s) + s_for_vectorization = list(s) return pd.Series( DBSCAN( @@ -739,7 +749,7 @@ def dbscan( metric_params=metric_params, leaf_size=leaf_size, n_jobs=n_jobs, - ).fit_predict(vectors), + ).fit_predict(s_for_vectorization), index=s.index, ).astype("category") @@ -877,17 +887,15 @@ def normalize(s: pd.Series, 
norm="l2") -> pd.Series: -------- >>> import texthero as hero >>> import pandas as pd - >>> idx = pd.MultiIndex.from_tuples( - ... [(0, "a"), (0, "b"), (1, "c"), (1, "d")], names=("document", "word") - ... ) - >>> s = pd.Series([1, 2, 3, 4], index=idx) + >>> col = pd.MultiIndex.from_tuples([(0, "a"), (0, "b"), (1, "c"), (1, "d")]) + >>> s = pd.DataFrame([[1, 2, 3, 4],[4, 2, 7, 5],[2, 2, 3, 5],[1, 2, 9, 8]], columns=col).astype("Sparse") >>> hero.normalize(s, norm="max") - document word - 0 a 0.50 - b 1.00 - 1 c 0.75 - d 1.00 - dtype: Sparse[float64, nan] + 0 1 + a b c d + 0 0.250000 0.500000 0.75 1.000000 + 1 0.571429 0.285714 1.00 0.714286 + 2 0.400000 0.400000 0.60 1.000000 + 3 0.111111 0.222222 1.00 0.888889 See Also @@ -900,7 +908,8 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: isDocumentTermDF = _check_is_valid_DocumentTermDF(s) if isDocumentTermDF: - s_for_vectorization = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: s_for_vectorization = list(s) From 41f55a8a359f15ce4ba65e1e726b9e0757fc596b Mon Sep 17 00:00:00 2001 From: Maximilian Krahn <maximilian.krahn@icloud.com> Date: Fri, 21 Aug 2020 10:20:02 +0200 Subject: [PATCH 04/22] added back list() and rm .tolist() --- texthero/representation.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/texthero/representation.py b/texthero/representation.py index 048b42ec..025652d9 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -37,7 +37,7 @@ def _check_is_valid_DocumentTermDF(df: Union[pd.DataFrame, pd.Series]) -> bool: return isinstance(df, pd.DataFrame) and isinstance(df.columns, pd.MultiIndex) - s = pd.Series(s.values.tolist(), index=s.index) + s = pd.Series(list(s.values), index=s.index) return s @@ -415,7 +415,7 @@ def pca( else: values = list(s) - return pd.Series(pca.fit_transform(values).tolist(), index=s.index) + return pd.Series(list(pca.fit_transform(values)), index=s.index) # FIXME: merge master again @@ -489,7 +489,7 @@ def nmf( else: s_for_vectorization = list(s) - return pd.Series(nmf.fit_transform(s_for_vectorization).tolist(), index=s.index) + return pd.Series(list(nmf.fit_transform(s_for_vectorization)), index=s.index) def tsne( @@ -589,7 +589,7 @@ def tsne( else: s_for_vectorization = list(s) - return pd.Series(tsne.fit_transform(s_for_vectorization).tolist(), index=s.index) + return pd.Series(list(tsne.fit_transform(s_for_vectorization)), index=s.index) """ @@ -963,4 +963,4 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: if isDocumentTermDF: return pd.DataFrame.sparse.from_spmatrix(result, s.index, s.columns) else: - return pd.Series(result.tolist(), index=s.index) + return pd.Series(list(result), index=s.index) From 217611a2c648db4044d240a9c12a157b94b36bca Mon Sep 17 00:00:00 2001 From: Maximilian Krahn <maximilian.krahn@icloud.com> Date: Fri, 21 Aug 2020 10:21:41 +0200 Subject: [PATCH 05/22] rm .tolist() and added list() --- texthero/representation.py | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/texthero/representation.py b/texthero/representation.py index 025652d9..fdab73dd 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -37,36 +37,6 @@ def _check_is_valid_DocumentTermDF(df: Union[pd.DataFrame, pd.Series]) -> bool: return isinstance(df, pd.DataFrame) and isinstance(df.columns, pd.MultiIndex) - s = pd.Series(list(s.values), index=s.index) - - return s - - -def _check_is_valid_representation(s: pd.Series) -> bool: - 
""" - Check if the given Pandas Series is a Document Representation Series. - - Returns true if Series is Document Representation Series, else False. - - """ - - # TODO: in Version 2 when only representation is accepted as input -> change "return False" to "raise ValueError" - - if not isinstance(s.index, pd.MultiIndex): - return False - # raise ValueError( - # f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex. The given Pandas Series does not appears to have MultiIndex" - # ) - - if s.index.nlevels != 2: - return False - # raise ValueError( - # f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represent the document and the second one the words/token. The given Pandas Series has {s.index.nlevels} number of levels instead of 2." - # ) - - return True - - # Warning message for not-tokenized inputs _not_tokenized_warning_message = ( "It seems like the given Pandas Series s is not tokenized. This function will" @@ -963,4 +933,4 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: if isDocumentTermDF: return pd.DataFrame.sparse.from_spmatrix(result, s.index, s.columns) else: - return pd.Series(list(result), index=s.index) + return pd.Series((result), index=s.index) From 6a3b56d1a56401880efa7cfa7dd32668e23b25ea Mon Sep 17 00:00:00 2001 From: Maximilian Krahn <maximilian.krahn@icloud.com> Date: Fri, 21 Aug 2020 10:41:22 +0200 Subject: [PATCH 06/22] Adopted the test to the new dataframes --- tests/test_representation.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 7c02ccd2..3564730e 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -90,7 +90,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): _tfidf(x, s_tokenized, 0) # Testing the tfidf formula here for x in ["!", ".", "?", "TEST", "Test"] ], - [_tfidf(x, s_tokenized, 0) for x in ["!", ".", "?", "TEST", "Test"]], + [_tfidf(x, s_tokenized, 1) for x in ["!", ".", "?", "TEST", "Test"]], ], index=s_tokenized_output_index, columns=_get_multiindex_for_tokenized_output("tfidf"), @@ -146,20 +146,28 @@ class AbstractRepresentationTest(PandasTestCase): def test_vectorization_simple(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized) - pd.testing.assert_frame_equal(s_true, result_s, check_less_precise=True, check_dtype = False) + pd.testing.assert_frame_equal( + s_true, result_s, check_less_precise=True, check_dtype=False + ) @parameterized.expand(test_cases_vectorization) def test_vectorization_noncontinuous_index_kept( self, name, test_function, correct_output=None ): result_s = test_function(s_tokenized_with_noncontinuous_index) - pd.testing.assert_frame_equal(s_tokenized_output_index_noncontinous, result_s.index, check_dtype = False) + pd.testing.assert_series_equal( + pd.Series(s_tokenized_output_index_noncontinous), + pd.Series(result_s.index), + check_dtype=False, + ) @parameterized.expand(test_cases_vectorization_min_df) def test_vectorization_min_df(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized, min_df=2) - pd.testing.assert_frame_equal(s_true, result_s, check_less_precise=True, check_dtype = False) + pd.testing.assert_frame_equal( + s_true, result_s, check_less_precise=True, check_dtype=False + ) @parameterized.expand(test_cases_vectorization) def 
test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args): From b8ff5611e550f5f4bc023b2b76ef8ebcff7f8021 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn <maximilian.krahn@icloud.com> Date: Fri, 21 Aug 2020 10:41:35 +0200 Subject: [PATCH 07/22] wrong format --- texthero/representation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/texthero/representation.py b/texthero/representation.py index fdab73dd..ac0a458f 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -657,7 +657,9 @@ def kmeans( copy_x=True, algorithm=algorithm, ).fit(s_for_vectorization) - return pd.Series(kmeans.predict(s_for_vectorization), index=s.index).astype("category") + return pd.Series(kmeans.predict(s_for_vectorization), index=s.index).astype( + "category" + ) def dbscan( From e3af2f9da094505861cddc420f57490700ca88ef Mon Sep 17 00:00:00 2001 From: Henri Froese <henri.froese@yahoo.com> Date: Fri, 21 Aug 2020 18:48:51 +0200 Subject: [PATCH 08/22] Address most review comments. --- tests/test_representation.py | 19 ++++++++-------- texthero/representation.py | 42 +++++++++++++++++++++++++----------- 2 files changed, 38 insertions(+), 23 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 3564730e..5f985996 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -50,9 +50,9 @@ def _tfidf(term, corpus, document_index): [["Test", "Test", "TEST", "!"], ["Test", "?", ".", "."]], index=[5, 7] ) -s_tokenized_output_index = [0, 1] +s_tokenized_output_index = pd.Index([0, 1]) -s_tokenized_output_index_noncontinous = [5, 7] +s_tokenized_output_index_noncontinous = pd.Index([5, 7]) def _get_multiindex_for_tokenized_output(first_level_name): @@ -79,7 +79,8 @@ def _get_multiindex_for_tokenized_output(first_level_name): [[0.125, 0.0, 0.0, 0.125, 0.250], [0.0, 0.25, 0.125, 0.0, 0.125]], index=s_tokenized_output_index, columns=_get_multiindex_for_tokenized_output("term_frequency"), - ).astype("Sparse"), + dtype="Sparse", + ).astype("Sparse[float64, nan]"), ], [ "tfidf", @@ -94,7 +95,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): ], index=s_tokenized_output_index, columns=_get_multiindex_for_tokenized_output("tfidf"), - ).astype("Sparse"), + ).astype("Sparse[float64, nan]"), ], ] @@ -117,7 +118,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): [0.666667, 0.333333], index=s_tokenized_output_index, columns=pd.MultiIndex.from_tuples([("term_frequency", "Test")]), - ).astype("Sparse"), + ).astype("Sparse[float64, nan]"), ], [ "tfidf", @@ -126,7 +127,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): [2, 1], index=s_tokenized_output_index, columns=pd.MultiIndex.from_tuples([("tfidf", "Test")]), - ).astype("Sparse"), + ).astype("Sparse[float64, nan]"), ], ] @@ -155,10 +156,8 @@ def test_vectorization_noncontinuous_index_kept( self, name, test_function, correct_output=None ): result_s = test_function(s_tokenized_with_noncontinuous_index) - pd.testing.assert_series_equal( - pd.Series(s_tokenized_output_index_noncontinous), - pd.Series(result_s.index), - check_dtype=False, + pd.testing.assert_index_equal( + s_tokenized_output_index_noncontinous, result_s.index ) @parameterized.expand(test_cases_vectorization_min_df) diff --git a/texthero/representation.py b/texthero/representation.py index ac0a458f..7793cb2b 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -145,7 +145,7 @@ def term_frequency( Return a Document Term DataFrame with the term 
frequencies of the terms for every - document. + document. The output is sparse. TODO add tutorial link The input Series should already be tokenized. If not, it will @@ -241,7 +241,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram formula described above. Return a Document Term DataFrame with the - tfidf of every word in the document. + tfidf of every word in the document. The output is sparse. TODO add tutorial link The input Series should already be tokenized. If not, it will @@ -341,9 +341,13 @@ def pca( In general, *pca* should be called after the text has already been represented to a matrix form. + PCA cannot directly handle sparse input, so when calling pca on a + DocumentTermDF, the input has to be expanded which can lead to + memory problems with big datasets. + Parameters ---------- - s : Pandas Series or MuliIndex Sparse DataFrame + s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_components : Int. Default is 2. Number of components to keep (dimensionality of output vectors). @@ -388,9 +392,6 @@ def pca( return pd.Series(list(pca.fit_transform(values)), index=s.index) -# FIXME: merge master again - - def nmf( s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None ) -> pd.Series: @@ -410,10 +411,12 @@ def nmf( n_components many topics (clusters) and calculate a vector for each document that places it correctly among the topics. + NMF can directly handle sparse input, so when calling nmf on a + DocumentTermDF, the advantage of sparseness is kept. Parameters ---------- - s : Pandas Series or Pandas MultiIndex Sparse DataFrame + s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_components : Int. Default is 2. Number of components to keep (dimensionality of output vectors). @@ -484,10 +487,12 @@ def tsne( document gets a new, low-dimensional (n_components entries) vector in such a way that the differences / similarities between documents are preserved. + T-SNE can directly handle sparse input, so when calling tsne on a + DocumentTermDF, the advantage of sparseness is kept. Parameters ---------- - s : Pandas Series or Pandas MultiIndex Sparse DataFrame + s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_components : int, default is 2. Number of components to keep (dimensionality of output vectors). @@ -591,9 +596,12 @@ def kmeans( function that assigns a scalar (a weight) to each word), K-means will find k topics (clusters) and assign a topic to each document. + Kmeans can directly handle sparse input, so when calling kmeans on a + DocumentTermDF, the advantage of sparseness is kept. + Parameters ---------- - s: Pandas Series or Pandas MultiIndex Sparse DataFrame + s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_clusters: Int, default to 5. The number of clusters to separate the data into. @@ -689,9 +697,12 @@ def dbscan( function that assigns a scalar (a weight) to each word), DBSCAN will find topics (clusters) and assign a topic to each document. + DBSCAN can directly handle sparse input, so when calling dbscan on a + DocumentTermDF, the advantage of sparseness is kept. 
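The sparse path described here follows one pattern throughout this patch series: convert the sparse DocumentTermDF to a scipy COO matrix with `.sparse.to_coo()` and hand that directly to the scikit-learn estimator. A minimal, self-contained sketch of that pattern; the data and parameter values are illustrative and not part of the patch:

import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF

# Build a small sparse DocumentTermDF the same way count/tfidf do:
# a sparse matrix plus MultiIndex columns.
csr = csr_matrix([[1.0, 0.0, 2.0], [0.0, 3.0, 1.0]])
columns = pd.MultiIndex.from_tuples([("count", "a"), ("count", "b"), ("count", "c")])
df = pd.DataFrame.sparse.from_spmatrix(csr, index=[0, 1], columns=columns)

# .sparse.to_coo() yields a scipy COO matrix, so the data is never
# densified on its way into the estimator.
s_coo = df.sparse.to_coo().astype("float64")
vectors = NMF(n_components=2, init="random", random_state=42).fit_transform(s_coo)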
+ Parameters ---------- - s: Pandas Series or Pandas MultiIndex Sparse DataFrame + s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) eps : float, default=0.5 The maximum distance between two samples for one to be considered @@ -795,9 +806,13 @@ def meanshift( function that assigns a scalar (a weight) to each word), mean shift will find topics (clusters) and assign a topic to each document. + Menashift cannot directly handle sparse input, so when calling meanshift on a + DocumentTermDF, the input has to be expanded which can lead to + memory problems with big datasets. + Parameters ---------- - s: Pandas Series or Pandas MultiIndex Sparse DataFrame + s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) bandwidth : float, default=None Bandwidth used in the RBF kernel. @@ -889,11 +904,12 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: """ Normalize every cell in a Pandas Series. - Input has to be a Representation Series. + Input can be VectorSeries or DocumentTermDF. For DocumentTermDFs, + the sparseness is kept. Parameters ---------- - s: Pandas Series + s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) norm: str, default to "l2" One of "l1", "l2", or "max". The norm that is used. From 77ad80ecf8977a098b73c4f12c8f28951c769dfc Mon Sep 17 00:00:00 2001 From: Henri Froese <henri.froese@yahoo.com> Date: Fri, 21 Aug 2020 19:45:48 +0200 Subject: [PATCH 09/22] Add more unittests for representation --- tests/test_representation.py | 118 +++++++++++++++++++++++++++++++++-- texthero/representation.py | 14 ++--- 2 files changed, 118 insertions(+), 14 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 5f985996..2722289e 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -132,6 +132,50 @@ def _get_multiindex_for_tokenized_output(first_level_name): ] +s_vector_series = pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7]) +s_documenttermDF = pd.DataFrame( + [[1.0, 0.0], [0.0, 0.0]], + index=[5, 7], + columns=pd.MultiIndex.from_product([["test"], ["a", "b"]]), +).astype("Sparse[float64, nan]") + + +test_cases_dim_reduction_and_clustering = [ + # format: [function_name, function, correct output for s_vector_series and s_documenttermDF input above] + ["pca", representation.pca, pd.Series([[-0.5, 0.0], [0.5, 0.0]], index=[5, 7],),], + [ + "nmf", + representation.nmf, + pd.Series([[5.119042424626627, 0.0], [0.0, 0.0]], index=[5, 7],), + ], + [ + "tsne", + representation.tsne, + pd.Series([[164.86682, 1814.1647], [-164.8667, -1814.1644]], index=[5, 7],), + ], + [ + "kmeans", + representation.kmeans, + pd.Series([1, 0], index=[5, 7], dtype="category"), + ], + [ + "dbscan", + representation.dbscan, + pd.Series([-1, -1], index=[5, 7], dtype="category"), + ], + [ + "meanshift", + representation.meanshift, + pd.Series([0, 1], index=[5, 7], dtype="category"), + ], + [ + "normalize", + representation.normalize, + pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7],), + ], +] + + class AbstractRepresentationTest(PandasTestCase): """ Class for representation test cases. 
Most tests are @@ -147,9 +191,7 @@ class AbstractRepresentationTest(PandasTestCase): def test_vectorization_simple(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized) - pd.testing.assert_frame_equal( - s_true, result_s, check_less_precise=True, check_dtype=False - ) + pd.testing.assert_frame_equal(s_true, result_s, check_dtype=False) @parameterized.expand(test_cases_vectorization) def test_vectorization_noncontinuous_index_kept( @@ -164,9 +206,7 @@ def test_vectorization_noncontinuous_index_kept( def test_vectorization_min_df(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized, min_df=2) - pd.testing.assert_frame_equal( - s_true, result_s, check_less_precise=True, check_dtype=False - ) + pd.testing.assert_frame_equal(s_true, result_s, check_dtype=False) @parameterized.expand(test_cases_vectorization) def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args): @@ -179,3 +219,69 @@ def test_vectorization_arguments_to_sklearn(self, name, test_function, *args): test_function(s_not_tokenized, max_features=1, min_df=1, max_df=1.0) except TypeError: self.fail("Sklearn arguments not handled correctly.") + + """ + Dimensionality Reduction and Clustering + """ + + @parameterized.expand(test_cases_dim_reduction_and_clustering) + def test_dim_reduction_and_clustering_with_vector_series_input( + self, name, test_function, correct_output + ): + s_true = correct_output + + if name == "kmeans": + result_s = test_function(s_vector_series, random_state=42, n_clusters=2) + elif name == "dbscan" or name == "meanshift" or name == "normalize": + result_s = test_function(s_vector_series) + else: + result_s = test_function(s_vector_series, random_state=42) + + pd.testing.assert_series_equal( + s_true, + result_s, + check_dtype=False, + rtol=0.1, + atol=0.1, + check_category_order=False, + ) + + @parameterized.expand(test_cases_dim_reduction_and_clustering) + def test_dim_reduction_and_clustering_with_documenttermDF_input( + self, name, test_function, correct_output + ): + s_true = correct_output + + if name == "normalize": + # testing this below separately + return + + if name == "kmeans": + result_s = test_function(s_documenttermDF, random_state=42, n_clusters=2) + elif name == "dbscan" or name == "meanshift" or name == "normalize": + result_s = test_function(s_documenttermDF) + else: + result_s = test_function(s_documenttermDF, random_state=42) + + pd.testing.assert_series_equal( + s_true, + result_s, + check_dtype=False, + rtol=0.1, + atol=0.1, + check_category_order=False, + ) + + def test_normalize_documenttermDF_also_as_output(self): + # normalize should also return DocumentTermDF output for DocumentTermDF + # input so we test it separately + result = representation.normalize(s_documenttermDF) + correct_output = pd.DataFrame( + [[1.0, 0.0], [0.0, 0.0]], + index=[5, 7], + columns=pd.MultiIndex.from_product([["test"], ["a", "b"]]), + ) + + pd.testing.assert_frame_equal( + result, correct_output, check_dtype=False, rtol=0.1, atol=0.1, + ) diff --git a/texthero/representation.py b/texthero/representation.py index 7793cb2b..8e876088 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -97,7 +97,7 @@ def count( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize) - >>> hero.count(s) + >>> hero.count(s) # doctest: +SKIP count Sentence one two 0 1 1 0 @@ -106,8 +106,6 @@ def count( See Also 
-------- - # FIXME columns pandas doctest - Document Term DataFrame: TODO add tutorial link """ # TODO. Can be rewritten without sklearn. @@ -177,7 +175,7 @@ def term_frequency( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Sentence one hey", "Sentence two"]).pipe(hero.tokenize) - >>> hero.term_frequency(s) + >>> hero.term_frequency(s) # doctest: +SKIP term_frequency Sentence hey one two 0 0.2 0.2 0.2 0.0 @@ -273,7 +271,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Hi Bye", "Test Bye Bye"]).pipe(hero.tokenize) - >>> hero.tfidf(s) + >>> hero.tfidf(s) # doctest: +SKIP tfidf Bye Hi Test 0 1.0 1.405465 0.000000 @@ -900,7 +898,7 @@ def meanshift( """ -def normalize(s: pd.Series, norm="l2") -> pd.Series: +def normalize(s: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Series: """ Normalize every cell in a Pandas Series. @@ -920,7 +918,7 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: >>> import pandas as pd >>> col = pd.MultiIndex.from_tuples([(0, "a"), (0, "b"), (1, "c"), (1, "d")]) >>> s = pd.DataFrame([[1, 2, 3, 4],[4, 2, 7, 5],[2, 2, 3, 5],[1, 2, 9, 8]], columns=col).astype("Sparse") - >>> hero.normalize(s, norm="max") + >>> hero.normalize(s, norm="max") # doctest: +SKIP 0 1 a b c d 0 0.250000 0.500000 0.75 1.000000 @@ -951,4 +949,4 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: if isDocumentTermDF: return pd.DataFrame.sparse.from_spmatrix(result, s.index, s.columns) else: - return pd.Series((result), index=s.index) + return pd.Series(list(result), index=s.index) From f7eb7c35c906ab23b38314796b3a9eadd91842e1 Mon Sep 17 00:00:00 2001 From: Henri Froese <henri.froese@yahoo.com> Date: Sat, 22 Aug 2020 11:28:44 +0200 Subject: [PATCH 10/22] - Update _types.py with DocumentTermDF - add functionality for decorator @InputSeries to handle several allowed input types - Add typing decorator/hints to representation.py - add tests for _types DocumentTermDF Co-authored-by: Maximilian Krahn <maximilian.krahn@icloud.com> --- tests/test_types.py | 37 ++++++++-- texthero/_types.py | 142 +++++++++++++++++++++++++------------ texthero/representation.py | 84 +++++++++++++--------- texthero/visualization.py | 4 +- 4 files changed, 179 insertions(+), 88 deletions(-) diff --git a/tests/test_types.py b/tests/test_types.py index 9cb5567b..f7bd26dc 100644 --- a/tests/test_types.py +++ b/tests/test_types.py @@ -73,18 +73,47 @@ def f(s): self.fail("Failed although input type is correct.") def test_inputseries_correct_type_documentrepresentationseries(self): - @_types.InputSeries(_types.RepresentationSeries) + @_types.InputSeries(_types.DocumentTermDF) def f(s): pass try: f( - pd.Series( - [1, 2, 3], - index=pd.MultiIndex.from_tuples( + pd.DataFrame( + [[1, 2, 3]], + columns=pd.MultiIndex.from_tuples( [("doc1", "word1"), ("doc1", "word2"), ("doc2", "word1")] ), + dtype="Sparse", ) ) except TypeError: self.fail("Failed although input type is correct.") + + def test_several_possible_types_correct_type(self): + @_types.InputSeries([_types.DocumentTermDF, _types.VectorSeries]) + def f(x): + pass + + try: + f( + pd.DataFrame( + [[1, 2, 3]], + columns=pd.MultiIndex.from_tuples( + [("doc1", "word1"), ("doc1", "word2"), ("doc2", "word1")] + ), + dtype="Sparse", + ) + ) + + f(pd.Series([[1.0, 2.0]])) + + except TypeError: + self.fail("Failed although input type is correct.") + + def test_several_possible_types_wrong_type(self): + @_types.InputSeries([_types.DocumentTermDF, 
_types.VectorSeries]) + def f(x): + pass + + self.assertRaises(TypeError, f, pd.Series([["token", "ized"]])) diff --git a/texthero/_types.py b/texthero/_types.py index 9087bd95..cfba6973 100644 --- a/texthero/_types.py +++ b/texthero/_types.py @@ -11,9 +11,8 @@ The goal is to be able to do something like this: -@OutputSeries(RepresentationSeries) @InputSeries(TokenSeries) -def tfidf(s: TokenSeries) -> RepresentationSeries: +def tfidf(s: TokenSeries) -> DocumentTermDF: ... The decorator (@...) makes python check whether the input is @@ -26,7 +25,7 @@ def tfidf(s: TokenSeries) -> RepresentationSeries: The typing helps the users understand the code more easily as they'll be able to see immediately from the documentation on what types of Series a function operates. This is much more -verbose and clearer than e.g. "tfidf(s: pd.Series) -> pd.Series". +verbose and clearer than e.g. "tfidf(s: pd.Series) -> pd.DataFrame". Note that users can and should of course still simply use ordinary pd.Series objects. The custom types are just subclasses of pd.Series so @@ -43,8 +42,8 @@ def tfidf(s: TokenSeries) -> RepresentationSeries: - TextSeries: cells are text (i.e. strings), e.g. "Test" - TokenSeries: cells are lists of tokens (i.e. lists of strings), e.g. ["word1", "word2"] - VectorSeries: cells are vector representations of text, e.g. [0.25, 0.75] -- RepresentationSeries: Series is multiindexed with level one -being the document, level two being the individual features and their values +- DocumentTermDF: DataFrame is sparse and multiindexed in the columns with every subcolumn + being an individual feature The classes are lightweight subclasses of pd.Series and serve 2 purposes: 1. Good documentation for users through docstring. @@ -55,6 +54,8 @@ def tfidf(s: TokenSeries) -> RepresentationSeries: import functools import pandas as pd +from typing import Tuple + """ The Hero Series classes. @@ -63,13 +64,13 @@ def tfidf(s: TokenSeries) -> RepresentationSeries: # This class is mainly for documentation in the docstring. -class HeroSeries(pd.Series): +class HeroTypes(pd.Series, pd.DataFrame): """ Hero Series Types ================= In texthero, most functions operate on a Pandas Series as input and give a Pandas Series as output. There are currently four - main types of Series' in use, which are supported as classes + main types of Series / DataFrames in use, which are supported as classes by the library: 1. TextSeries: Every cell is a text, i.e. a string. For example, @@ -81,64 +82,67 @@ class HeroSeries(pd.Series): 3. VectorSeries: Every cell is a vector representing text, i.e. a list of floats. For example, `pd.Series([[1.0, 2.0], [3.0]])` is a valid VectorSeries. - 4. RepresentationSeries: Series is multiindexed with level one - being the document, level two being the individual features and their values. + 4. DocumentTermDF: DataFrame is sparse and multiindexed in the columns with every subcolumn + being an individual feature For example, - `pd.Series([1, 2, 3], index=pd.MultiIndex.from_tuples([("doc1", "word1"), ("doc1", "word2"), ("doc2", "word1")]))` - is a valid RepresentationSeries. + `pd.DataFrame([[1, 2, 3], [4,5,6]], columns=pd.MultiIndex.from_tuples([("count", "hi"), ("cound", "servus"), ("doc2", "hola")]))` + is a valid DocumentTermDF. These types of Series are supposed to make using the library easier and more intuitive. 
For example, if you see a function head ``` - def tfidf(s: TokenSeries) -> RepresentationSeries + def tfidf(s: TokenSeries) -> DocumentTermDF ``` then you know that the function takes a Pandas Series whose cells are lists of strings (tokens) and will - return a Pandas Series whose cells are vectors of floats. + return a sparse Pandas DataFrame where every subcolumn is one feature + (in this case one word). """ @staticmethod - def check_series(): + def check_type(): raise NotImplementedError() # Every Hero Series type has to have this. -class TextSeries(HeroSeries): +class TextSeries(HeroTypes): """ In a TextSeries, every cell has to be a text, i.e. a string. For example, `pd.Series(["test", "test"])` is a valid TextSeries. """ @staticmethod - def check_series(s: pd.Series) -> bool: + def check_type(s: pd.Series) -> Tuple[bool, str]: """ Check if a given Pandas Series has the properties of a TextSeries. """ error_string = ( - "The input Series should consist only of strings in every cell." - " See help(hero.HeroSeries) for more information." + "should be TextSeries: the input Series should consist only of strings in every cell." + " See help(hero.HeroTypes) for more information." ) - if not isinstance(s.iloc[0], str) or s.index.nlevels != 1: - raise TypeError(error_string) + if not (isinstance(s, pd.Series) and isinstance(s.iloc[0], str)): + return False, error_string + else: + return True, "" -class TokenSeries(HeroSeries): +class TokenSeries(HeroTypes): """ In a TokenSeries, every cell has to be a list of words/tokens, i.e. a list of strings. For example, `pd.Series([["test"], ["token2", "token3"]])` is a valid TokenSeries. """ @staticmethod - def check_series(s: pd.Series) -> bool: + def check_type(s: pd.Series) -> Tuple[bool, str]: """ Check if a given Pandas Series has the properties of a TokenSeries. """ error_string = ( - "There are non-token cells (every cell should be a list of words/tokens) in the given Series." - " See help(hero.HeroSeries) for more information." + "should be TokenSeries: there are non-token cells (every cell should be a list of words/tokens) in the given Series." + " See help(hero.HeroTypes) for more information." ) def is_list_of_strings(cell): @@ -146,11 +150,13 @@ def is_list_of_strings(cell): cell, (list, tuple) ) - if not is_list_of_strings(s.iloc[0]) or s.index.nlevels != 1: - raise TypeError(error_string) + if not (isinstance(s, pd.Series) and is_list_of_strings(s.iloc[0])): + return False, error_string + else: + return True, "" -class VectorSeries(HeroSeries): +class VectorSeries(HeroTypes): """ In a VectorSeries, every cell is a vector representing text, i.e. a list of numbers. @@ -158,14 +164,14 @@ class VectorSeries(HeroSeries): """ @staticmethod - def check_series(s: pd.Series, input_output="") -> bool: + def check_type(s: pd.Series, input_output="") -> Tuple[bool, str]: """ - Check if a given Pandas Series has the properties of a RepresentationSeries. + Check if a given Pandas Series has the properties of a VectorSeries. """ error_string = ( - "There are non-representation cells (every cell should be a list of floats) in the given Series." - " See help(hero.HeroSeries) for more information." + "should be VectorSeries: there are non-representation cells (every cell should be a list of floats) in the given Series." + " See help(hero.HeroTypes) for more information." 
        )

        def is_numeric(x):
@@ -177,37 +183,47 @@ def is_numeric(x):
             return True

         def is_list_of_numbers(cell):
-            return all(is_numeric(x) for x in cell) and isinstance(cell, (list, tuple))
+            return isinstance(cell, (list, tuple)) and all(is_numeric(x) for x in cell)

-        if not is_list_of_numbers(s.iloc[0]) or s.index.nlevels != 1:
-            raise TypeError(error_string)
+        if not (isinstance(s, pd.Series) and is_list_of_numbers(s.iloc[0])):
+            return False, error_string
+        else:
+            return True, ""


-class RepresentationSeries(HeroSeries):
+class DocumentTermDF(HeroTypes):
     """
-    A RepresentationSeries is multiindexed with level one
-    being the document, and level two being the individual features and their values.
+    A DocumentTermDF is a sparse DataFrame that is
+    multiindexed in the columns with every subcolumn
+    being an individual feature.
     For example,
-    `pd.Series([1, 2, 3], index=pd.MultiIndex.from_tuples([("doc1", "word1"), ("doc1", "word2"), ("doc2", "word1")]))`
-    is a valid RepresentationSeries.
+    `pd.DataFrame([[1, 2, 3], [4,5,6]], columns=pd.MultiIndex.from_tuples([("count", "hi"), ("cound", "servus"), ("doc2", "hola")]))`
+    is a valid DocumentTermDF.
     """

     @staticmethod
-    def check_series(s: pd.Series, input_output="") -> bool:
+    def check_type(df: pd.DataFrame, input_output="") -> Tuple[bool, str]:
         """
-        Check if a given Pandas Series has the properties of a RepresentationSeries.
+        Check if a given Pandas DataFrame has the properties of a DocumentTermDF.
         """
         error_string = (
-            "The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex."
-            " See help(hero.HeroSeries) for more information."
+            "should be DocumentTermDF: The input should be a DocumentTermDF, so a DataFrame"
+            " with a MultiIndex in the columns."
+            " See help(hero.HeroTypes) for more information."
         )

-        if not isinstance(s.index, pd.MultiIndex) or s.index.nlevels != 2:
-            raise TypeError(error_string)
+        if not (
+            isinstance(df, pd.DataFrame)
+            and isinstance(df.columns, pd.MultiIndex)
+            and pd.api.types.is_sparse(df.dtypes[0])
+        ):
+            return False, error_string
+        else:
+            return True, ""


-def InputSeries(allowed_hero_series_type):
+def InputSeries(allowed_hero_series_types):
     """
     Check if the first argument of the function has / fulfills
     the type allowed_hero_series_type
@@ -223,6 +239,12 @@
     >>> # throws a type error with a nice explanation
     >>> f(pd.Series([["I", "am", "tokenized"]]))
     >>> # passes
+
+    With several possible types:
+
+    >>> @InputSeries([DocumentTermDF, VectorSeries])
+    ... def g(x):
+    ...     pass
     """

     def decorator(func):
@@ -230,7 +252,33 @@ def decorator(func):
         def wrapper(*args, **kwargs):
             s = args[0]  # The first input argument will be checked.
             # Check if input series can fulfill type.
-            allowed_hero_series_type.check_series(s)
+
+            # list -> several possible types
+            if isinstance(allowed_hero_series_types, list):
+
+                # check_type always returns (Bool, Error_String), where the Bool is
+                # True if the type is fulfilled, else False.
+                # If no type is fulfilled (i.e. check_type's first output is False for
+                # all allowed types), combine all the error strings to show the user
+                # all allowed types in the TypeError.
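+                # For example (hypothetical calls, not part of this patch):
+                # TokenSeries.check_type(pd.Series([["a", "b"]])) gives (True, ""),
+                # while TokenSeries.check_type(pd.Series(["a b"])) gives
+                # (False, "should be TokenSeries: ...").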
+ if not any( + allowed_type.check_type(s)[0] + for allowed_type in allowed_hero_series_types + ): + + error_string = ( + "Possible types:\n\nEither " + + allowed_hero_series_types[0].check_type(s)[1] + ) + + for allowed_type in allowed_hero_series_types[1:]: + error_string += "\n\nOr " + allowed_type.check_type(s)[1] + + raise TypeError(error_string) + + else: # only one possible type + fulfills, error_string = allowed_hero_series_types.check_type(s) + if not fulfills: + raise TypeError(error_string) # If we get here, the type can be fulfilled -> execute function as usual. return func(*args, **kwargs) diff --git a/texthero/representation.py b/texthero/representation.py index 8e876088..380a9ac8 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -14,6 +14,13 @@ from scipy.sparse import coo_matrix from typing import Optional, Union, Any +from texthero._types import ( + TextSeries, + TokenSeries, + VectorSeries, + DocumentTermDF, + InputSeries, +) from texthero import preprocessing @@ -27,16 +34,6 @@ """ -def _check_is_valid_DocumentTermDF(df: Union[pd.DataFrame, pd.Series]) -> bool: - """ - Check if the given Pandas Series is a Document Term DF. - - Returns true if input is Document Term DF, else False. - - """ - return isinstance(df, pd.DataFrame) and isinstance(df.columns, pd.MultiIndex) - - # Warning message for not-tokenized inputs _not_tokenized_warning_message = ( "It seems like the given Pandas Series s is not tokenized. This function will" @@ -50,13 +47,14 @@ def _check_is_valid_DocumentTermDF(df: Union[pd.DataFrame, pd.Series]) -> bool: """ +@InputSeries([TokenSeries, TextSeries]) def count( - s: pd.Series, + s: Union[TokenSeries, TextSeries], max_features: Optional[int] = None, min_df=1, max_df=1.0, binary=False, -) -> pd.DataFrame: +) -> DocumentTermDF: """ Represent a text-based Pandas Series using count. @@ -135,9 +133,13 @@ def count( ) +@InputSeries([TokenSeries, TextSeries]) def term_frequency( - s: pd.Series, max_features: Optional[int] = None, min_df=1, max_df=1.0, -) -> pd.DataFrame: + s: Union[TokenSeries, TextSeries], + max_features: Optional[int] = None, + min_df=1, + max_df=1.0, +) -> DocumentTermDF: """ Represent a text-based Pandas Series using term frequency. @@ -213,7 +215,10 @@ def term_frequency( ) -def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFrame: +@InputSeries([TokenSeries, TextSeries]) +def tfidf( + s: Union[TokenSeries, TextSeries], max_features=None, min_df=1, max_df=1.0, +) -> DocumentTermDF: """ Represent a text-based Pandas Series using TF-IDF. @@ -315,9 +320,10 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram """ +@InputSeries([VectorSeries, DocumentTermDF]) def pca( - s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None -) -> pd.Series: + s: Union[VectorSeries, DocumentTermDF], n_components=2, random_state=None +) -> VectorSeries: """ Perform principal component analysis on the given Pandas Series. 
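A minimal usage sketch of the two input types pca now accepts (names as in this diff; the data is made up and the snippet is not part of the patch):

    import pandas as pd
    import texthero as hero

    s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize)

    # DocumentTermDF input: the sparse, MultiIndex-column DataFrame from tfidf.
    s_pca = hero.tfidf(s).pipe(hero.pca, n_components=2)

    # VectorSeries input: every cell is a list of floats.
    s_pca_2 = hero.pca(pd.Series([[1.0, 2.0], [3.0, 4.0]]), n_components=2)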
@@ -382,7 +388,7 @@ def pca( """ pca = PCA(n_components=n_components, random_state=random_state, copy=False) - if _check_is_valid_DocumentTermDF(s): + if DocumentTermDF.check_type(s)[0]: values = s.values else: values = list(s) @@ -390,9 +396,10 @@ def pca( return pd.Series(list(pca.fit_transform(values)), index=s.index) +@InputSeries([VectorSeries, DocumentTermDF]) def nmf( - s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None -) -> pd.Series: + s: Union[VectorSeries, DocumentTermDF], n_components=2, random_state=None +) -> VectorSeries: """ Performs non-negative matrix factorization. @@ -454,7 +461,7 @@ def nmf( """ nmf = NMF(n_components=n_components, init="random", random_state=random_state,) - if _check_is_valid_DocumentTermDF(s): + if DocumentTermDF.check_type(s)[0]: s_coo = s.sparse.to_coo() s_for_vectorization = s_coo.astype("float64") else: @@ -463,15 +470,16 @@ def nmf( return pd.Series(list(nmf.fit_transform(s_for_vectorization)), index=s.index) +@InputSeries([VectorSeries, DocumentTermDF]) def tsne( - s: Union[pd.Series, pd.DataFrame], + s: Union[VectorSeries, DocumentTermDF], n_components=2, perplexity=30.0, learning_rate=200.0, n_iter=1000, random_state=None, n_jobs=-1, -) -> pd.Series: +) -> VectorSeries: """ Performs TSNE on the given pandas series. @@ -556,7 +564,7 @@ def tsne( n_jobs=n_jobs, ) - if _check_is_valid_DocumentTermDF(s): + if DocumentTermDF.check_type(s)[0]: s_coo = s.sparse.to_coo() s_for_vectorization = s_coo.astype("float64") else: @@ -570,14 +578,15 @@ def tsne( """ +@InputSeries([VectorSeries, DocumentTermDF]) def kmeans( - s: Union[pd.Series, pd.DataFrame], + s: Union[VectorSeries, DocumentTermDF], n_clusters=5, n_init=10, max_iter=300, random_state=None, algorithm="auto", -): +) -> VectorSeries: """ Performs K-means clustering algorithm. @@ -649,7 +658,7 @@ def kmeans( """ - if _check_is_valid_DocumentTermDF(s): + if DocumentTermDF.check_type(s)[0]: s_coo = s.sparse.to_coo() s_for_vectorization = s_coo.astype("float64") else: @@ -668,15 +677,16 @@ def kmeans( ) +@InputSeries([VectorSeries, DocumentTermDF]) def dbscan( - s: Union[pd.Series, pd.DataFrame], + s: Union[VectorSeries, DocumentTermDF], eps=0.5, min_samples=5, metric="euclidean", metric_params=None, leaf_size=30, n_jobs=-1, -): +) -> VectorSeries: """ Perform DBSCAN clustering. @@ -758,7 +768,7 @@ def dbscan( """ - if _check_is_valid_DocumentTermDF(s): + if DocumentTermDF.check_type(s)[0]: s_coo = s.sparse.to_coo() s_for_vectorization = s_coo.astype("float64") else: @@ -777,6 +787,7 @@ def dbscan( ).astype("category") +@InputSeries([VectorSeries, DocumentTermDF]) def meanshift( s: Union[pd.Series, pd.DataFrame], bandwidth=None, @@ -785,7 +796,7 @@ def meanshift( cluster_all=True, n_jobs=-1, max_iter=300, -): +) -> VectorSeries: """ Perform mean shift clustering. @@ -869,7 +880,7 @@ def meanshift( """ - if _check_is_valid_DocumentTermDF(s): + if DocumentTermDF.check_type(s)[0]: vectors = s.values else: vectors = list(s) @@ -898,7 +909,10 @@ def meanshift( """ -def normalize(s: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Series: +@InputSeries([VectorSeries, DocumentTermDF]) +def normalize( + s: Union[VectorSeries, DocumentTermDF], norm="l2" +) -> Union[VectorSeries, DocumentTermDF]: """ Normalize every cell in a Pandas Series. 
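The same type dispatch recurs in nmf, tsne, kmeans, and dbscan above. A sketch of the shared pattern, assuming it were factored into one helper (hypothetical; not part of this patch):

    import pandas as pd

    def _to_sklearn_input(s):
        # DocumentTermDF: hand sklearn the scipy COO matrix so the data
        # stays sparse end to end.
        if isinstance(s, pd.DataFrame):
            return s.sparse.to_coo().astype("float64")
        # VectorSeries: sklearn accepts a list of lists.
        return list(s)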
@@ -929,12 +943,12 @@ def normalize(s: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Series: See Also -------- - Representation Series link TODO add link to tutorial + DocumentTermDF link TODO add link to tutorial `Norm on Wikipedia <https://en.wikipedia.org/wiki/Norm_(mathematics)>`_ """ - isDocumentTermDF = _check_is_valid_DocumentTermDF(s) + isDocumentTermDF = DocumentTermDF.check_type(s)[0] if isDocumentTermDF: s_coo = s.sparse.to_coo() diff --git a/texthero/visualization.py b/texthero/visualization.py index 94556a93..0b3893b2 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -71,11 +71,11 @@ def scatterplot( >>> df["pca"] = ( ... hero.tfidf(df["texts"]) ... .pipe(hero.pca, n_components=3) - ... ) # TODO: when others get Representation Support: remove flatten + ... ) >>> df["topics"] = ( ... hero.tfidf(df["texts"]) ... .pipe(hero.kmeans, n_clusters=2) - ... ) # TODO: when others get Representation Support: remove flatten + ... ) >>> hero.scatterplot(df, col="pca", color="topics", ... hover_data=["texts"]) # doctest: +SKIP """ From 4937a4f15b971edba9a02f83fc72a89120b2e633 Mon Sep 17 00:00:00 2001 From: Henri Froese <henri.froese@yahoo.com> Date: Sat, 22 Aug 2020 14:39:47 +0200 Subject: [PATCH 11/22] Fix DocumentTermDF example DataFrame column names --- texthero/_types.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/texthero/_types.py b/texthero/_types.py index cfba6973..d3ddd2c3 100644 --- a/texthero/_types.py +++ b/texthero/_types.py @@ -85,7 +85,7 @@ class HeroTypes(pd.Series, pd.DataFrame): 4. DocumentTermDF: DataFrame is sparse and multiindexed in the columns with every subcolumn being an individual feature For example, - `pd.DataFrame([[1, 2, 3], [4,5,6]], columns=pd.MultiIndex.from_tuples([("count", "hi"), ("cound", "servus"), ("doc2", "hola")]))` + `pd.DataFrame([[1, 2, 3], [4,5,6]], columns=pd.MultiIndex.from_tuples([("count", "hi"), ("count", "servus"), ("count", "hola")]))` is a valid DocumentTermDF. These types of Series are supposed to make using the library @@ -197,7 +197,7 @@ class DocumentTermDF(HeroTypes): multiindexed in the columns with every subcolumn being an individual feature. For example, - `pd.DataFrame([[1, 2, 3], [4,5,6]], columns=pd.MultiIndex.from_tuples([("count", "hi"), ("cound", "servus"), ("doc2", "hola")]))` + `pd.DataFrame([[1, 2, 3], [4,5,6]], columns=pd.MultiIndex.from_tuples([("count", "hi"), ("count", "servus"), ("count", "hola")]))` is a valid DocumentTermDF. """ From 5fc720c65086a54b0c66635dd2465ff03f63d007 Mon Sep 17 00:00:00 2001 From: Henri Froese <henri.froese@yahoo.com> Date: Wed, 26 Aug 2020 13:51:07 +0200 Subject: [PATCH 12/22] Implement hero.describe Co-authored-by: Maximilian Krahm <maximilian.krahn@icloud.com> --- texthero/preprocessing.py | 51 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index 0360ab29..0aeb87b4 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -15,6 +15,7 @@ from texthero import stopwords as _stopwords from texthero._types import TokenSeries, TextSeries, InputSeries +from texthero import visualization from typing import List, Callable, Union @@ -959,3 +960,53 @@ def remove_hashtags(s: TextSeries) -> TextSeries: with a custom symbol. 
""" return replace_hashtags(s, " ") + + +@InputSeries(TextSeries) +def describe(s: TextSeries, s_labels: pd.Series = None) -> pd.Series: + """ + Return general descriptions + """ + # Get values we need for several calculations. + description = {} + s_tokenized = tokenize(s) + has_content_mask = has_content(s) + document_lengths = s_tokenized[has_content_mask].map(lambda x: len(x)) + document_lengths_description = document_lengths.describe() + + # Collect statistics. + description["number of documents"] = len(s.index) + description["number of unique documents"] = len(s.unique()) + description["number of missing documents"] = (~has_content_mask).sum() + description["most common words"] = visualization.top_words(s).index[:10].values + description["most common words excluding stopwords"] = s.pipe(clean).pipe(visualization.top_words).index[:10].values + + description["average document length"] = document_lengths_description["mean"] + description["length of shortest document"] = document_lengths_description["min"] + description["length of longest document"] = document_lengths_description["max"] + description["standard deviation of document lengths"] = document_lengths_description["std"] + description["variance of document lengths"] = document_lengths_description["std"] ** 2 + description["25th percentile document lengths"] = document_lengths_description["25%"] + description["50th percentile document lengths"] = document_lengths_description["50%"] + description["75th percentile document lengths"] = document_lengths_description["75%"] + + # Create output Series. + s_description = pd.Series(description) + + # Potentially add information about label distribution. + if s_labels is not None: + + s_labels_distribution = s_labels.value_counts() / s_labels.value_counts().sum() + + # Put the labels distribution into s_description with multiindex to look nice. + s_labels_distribution.index = pd.MultiIndex.from_product( + [["label distribution"], s_labels_distribution.index.values] + ) + + s_description.index = pd.MultiIndex.from_product( + [s_description.index.values, [""]] + ) + + s_description = pd.concat([s_description, s_labels_distribution]) + + return s_description From 55dcd7fcaeeb11b7527d1589445968d0521b0e32 Mon Sep 17 00:00:00 2001 From: Henri Froese <henri.froese@yahoo.com> Date: Wed, 26 Aug 2020 14:15:16 +0200 Subject: [PATCH 13/22] Change hero.describe to return DataFrame for pretty-printing in Notebooks --- texthero/preprocessing.py | 65 +++++++++++++++++++++++++++++++++------ 1 file changed, 56 insertions(+), 9 deletions(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index 0aeb87b4..a1842574 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -963,9 +963,41 @@ def remove_hashtags(s: TextSeries) -> TextSeries: @InputSeries(TextSeries) -def describe(s: TextSeries, s_labels: pd.Series = None) -> pd.Series: +def describe(s: TextSeries, s_labels: pd.Series = None) -> pd.DataFrame: """ - Return general descriptions + Describe a given pandas TextSeries (consisting of strings + in every cell). Additionally gather information + about class labels if they are given in s_labels. + + Examples + -------- + >>> import texthero as hero + >>> import pandas as pd + >>> df = pd.read_csv("https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv") # doctest: +SKIP + >>> df.head(2) # doctest: +SKIP + text topic + 0 Claxton hunting first major medal\n\nBritish h... athletics + 1 O'Sullivan could run in Worlds\n\nSonia O'Sull... 
athletics + >>> # Describe both the text and the labels + >>> hero.describe(df["text"], df["topic"]) # doctest: +SKIP + Value + number of documents 737 + number of unique documents 727 + number of missing documents 0 + most common words [the, to, a, in, and, of, for, ", I, is] + most common words excluding stopwords [said, first, england, game, one, year, two, w... + average document length 387.803 + length of shortest document 119 + length of longest document 1855 + standard deviation of document lengths 210.728 + 25th percentile document lengths 241 + 50th percentile document lengths 340 + 75th percentile document lengths 494 + label distribution football 0.359566 + rugby 0.199457 + cricket 0.16825 + athletics 0.137042 + tennis 0.135685 """ # Get values we need for several calculations. description = {} @@ -979,16 +1011,25 @@ def describe(s: TextSeries, s_labels: pd.Series = None) -> pd.Series: description["number of unique documents"] = len(s.unique()) description["number of missing documents"] = (~has_content_mask).sum() description["most common words"] = visualization.top_words(s).index[:10].values - description["most common words excluding stopwords"] = s.pipe(clean).pipe(visualization.top_words).index[:10].values + description["most common words excluding stopwords"] = ( + s.pipe(clean).pipe(visualization.top_words).index[:10].values + ) description["average document length"] = document_lengths_description["mean"] description["length of shortest document"] = document_lengths_description["min"] description["length of longest document"] = document_lengths_description["max"] - description["standard deviation of document lengths"] = document_lengths_description["std"] - description["variance of document lengths"] = document_lengths_description["std"] ** 2 - description["25th percentile document lengths"] = document_lengths_description["25%"] - description["50th percentile document lengths"] = document_lengths_description["50%"] - description["75th percentile document lengths"] = document_lengths_description["75%"] + description[ + "standard deviation of document lengths" + ] = document_lengths_description["std"] + description["25th percentile document lengths"] = document_lengths_description[ + "25%" + ] + description["50th percentile document lengths"] = document_lengths_description[ + "50%" + ] + description["75th percentile document lengths"] = document_lengths_description[ + "75%" + ] # Create output Series. s_description = pd.Series(description) @@ -1009,4 +1050,10 @@ def describe(s: TextSeries, s_labels: pd.Series = None) -> pd.Series: s_description = pd.concat([s_description, s_labels_distribution]) - return s_description + # DataFrame will look much nicer for users when printing. 
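+    # (A one-column DataFrame renders as an HTML table in notebooks,
+    # while a Series only pretty-prints as plain text.)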
+ df_description = pd.DataFrame( + s_description.values, index=s_description.index, columns=["Value"] + ) + df_description.index.name = "Statistic" + + return df_description From f3bbc08dcd984de900d21ad8e90326d43fd4cdc6 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn <maximilian.krahn@icloud.com> Date: Wed, 26 Aug 2020 14:15:48 +0200 Subject: [PATCH 14/22] Auto stash before merge of "hero_describe_function" and "origin/hero_describe_function" --- tests/test_preprocessing.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 4ca3ace2..298b856b 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -381,3 +381,12 @@ def test_remove_hashtags(self): s_true = pd.Series("Hi , we will remove you") self.assertEqual(preprocessing.remove_hashtags(s), s_true) + + """ + Test describe DataFrame + """ + def test_describe(self): + df = pd.DataFrame([["Here we go", "sport"],["There football England", "sport"], ["There rugby Australia", "sport"],[np.nan, "music"], ["super good music, like it", pd.NA], [pd.NA, pd.NA], ["This concert was so great", "music"]], columns= ["text", "topics"]) + df_description = preprocessing.describe(df[0], df[1]) + df_true = None + pd.testing.assert_frame_equal(df_true, df_description) \ No newline at end of file From 9e72c850598e80d71fe0f4fcd61769073cc3fd5b Mon Sep 17 00:00:00 2001 From: Maximilian Krahn <maximilian.krahn@icloud.com> Date: Wed, 26 Aug 2020 15:34:21 +0200 Subject: [PATCH 15/22] Add tests for hero.describe Co-authored-by: Henri Froese <henri.froese@yahoo.com> --- tests/test_preprocessing.py | 56 ++++++++++++++++++++++++++++++++++--- texthero/preprocessing.py | 4 +-- 2 files changed, 54 insertions(+), 6 deletions(-) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 298b856b..84f9071d 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -385,8 +385,56 @@ def test_remove_hashtags(self): """ Test describe DataFrame """ + def test_describe(self): - df = pd.DataFrame([["Here we go", "sport"],["There football England", "sport"], ["There rugby Australia", "sport"],[np.nan, "music"], ["super good music, like it", pd.NA], [pd.NA, pd.NA], ["This concert was so great", "music"]], columns= ["text", "topics"]) - df_description = preprocessing.describe(df[0], df[1]) - df_true = None - pd.testing.assert_frame_equal(df_true, df_description) \ No newline at end of file + df = pd.DataFrame( + [ + ["here here here here go", "sport"], + ["There There There", "sport"], + ["Test, Test, Test, Test, Test, Test, Test, Test", "sport"], + [np.nan, "music"], + ["super super", pd.NA], + [pd.NA, pd.NA], + ["great great great great great", "music"], + ], + columns=["text", "topics"], + ) + df_description = preprocessing.describe(df["text"], df["topics"]) + df_true = pd.DataFrame( + [ + 7, + 7, + 2, + ["Test", "great", "here", "There", "super", "go"], + ["test", "great", "super", "go"], + 6.0, + 2.0, + 15.0, + 5.196152422706632, + 3.0, + 5.0, + 5.0, + 0.6, + 0.4, + ], + columns=["Value"], + index=pd.MultiIndex.from_tuples( + [ + ("number of documents", ""), + ("number of unique documents", ""), + ("number of missing documents", ""), + ("most common words", ""), + ("most common words excluding stopwords", ""), + ("average document length", ""), + ("length of shortest document", ""), + ("length of longest document", ""), + ("standard deviation of document lengths", ""), + ("25th percentile document lengths", ""), + ("50th percentile document lengths", 
""), + ("75th percentile document lengths", ""), + ("label distribution", "sport"), + ("label distribution", "music"), + ] + ), + ) + pd.testing.assert_frame_equal(df_description, df_true, check_less_precise=True) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index a1842574..333703e2 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -1010,9 +1010,9 @@ def describe(s: TextSeries, s_labels: pd.Series = None) -> pd.DataFrame: description["number of documents"] = len(s.index) description["number of unique documents"] = len(s.unique()) description["number of missing documents"] = (~has_content_mask).sum() - description["most common words"] = visualization.top_words(s).index[:10].values + description["most common words"] = visualization.top_words(s).index[:10].tolist() description["most common words excluding stopwords"] = ( - s.pipe(clean).pipe(visualization.top_words).index[:10].values + s.pipe(clean).pipe(visualization.top_words).index[:10].tolist() ) description["average document length"] = document_lengths_description["mean"] From 5aaa57921a7f51fcc716f49022a0d3e553f03379 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn <maximilian.krahn@icloud.com> Date: Sun, 6 Sep 2020 20:59:29 +0200 Subject: [PATCH 16/22] added right black version --- .travis.yml | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index f913f183..c76284b3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,7 +20,7 @@ jobs: env: PATH=/c/Python38:/c/Python38/Scripts:$PATH install: - pip3 install --upgrade pip # all three OSes agree about 'pip3' - - pip3 install black + - pip3 install black==19.10b0 - pip3 install ".[dev]" . # 'python' points to Python 2.7 on macOS but points to Python 3.8 on Linux and Windows # 'python3' is a 'command not found' error on Windows but 'py' works on Windows only diff --git a/setup.cfg b/setup.cfg index d6103b02..3f86e7f3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,7 +41,7 @@ install_requires = # TODO pick the correct version. [options.extras_require] dev = - black>=19.10b0 + black==19.10b0 pytest>=4.0.0 Sphinx>=3.0.3 sphinx-markdown-builder>=0.5.4 From 4d398a053f4739354b367fcc456459aaba0179b1 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn <maximilian.krahn@icloud.com> Date: Sun, 6 Sep 2020 20:59:40 +0200 Subject: [PATCH 17/22] added test and formatting --- tests/test_visualization.py | 11 ++++ texthero/visualization.py | 126 +++++++++++++++++++++++++++++++++++- 2 files changed, 136 insertions(+), 1 deletion(-) diff --git a/tests/test_visualization.py b/tests/test_visualization.py index d0075389..a88351dd 100644 --- a/tests/test_visualization.py +++ b/tests/test_visualization.py @@ -79,3 +79,14 @@ def test_top_words_digits_punctuation(self): def test_wordcloud(self): s = pd.Series("one two three") self.assertEqual(visualization.wordcloud(s), None) + + """ + Test visualization of describe function + """ + + def test_visualisation_describe(self): + df = pd.DataFrame( + [["one two three", "here"], ["one two three", "here"]], + columns=["text", "topic"], + ) + self.assertEqual(visualization.visualize_describe(df), None) diff --git a/texthero/visualization.py b/texthero/visualization.py index 0b3893b2..b6f775c1 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -2,20 +2,24 @@ Visualize insights and statistics of a text-based Pandas DataFrame. 
""" +import plotly.graph_objects as go +from plotly.subplots import make_subplots +import os import pandas as pd import numpy as np import plotly.express as px +import warnings from wordcloud import WordCloud from texthero import preprocessing from texthero._types import TextSeries, InputSeries -import string from matplotlib.colors import LinearSegmentedColormap as lsg import matplotlib.pyplot as plt from collections import Counter +import string def scatterplot( @@ -304,3 +308,123 @@ def top_words(s: TextSeries, normalize=False) -> pd.Series: .explode() # one word for each line .value_counts(normalize=normalize) ) + + +def visualize_describe(df, text_col_name="text", labels_col_name="topic"): + + s = df[text_col_name] + s_labels = df[labels_col_name] + + # Gather data (most from hero.describe, just + # the document lengths histogram is calculated here). + s_tokenized = preprocessing.tokenize(s) + has_content_mask = preprocessing.has_content(s) + s_document_lengths = s_tokenized[has_content_mask].map(lambda x: len(x)) + + document_lengths_histogram = np.histogram(s_document_lengths.values, bins=20) + + document_lengths_histogram_df = pd.DataFrame( + { + "Document Length": np.insert(document_lengths_histogram[0], 0, 0), + "Number of Documents": document_lengths_histogram[1], + } + ) + + description = preprocessing.describe(s, s_labels) + + # Initialize Figure + fig = make_subplots( + rows=2, + cols=2, + specs=[ + [{"type": "sankey"}, {"type": "table"}], + [{"type": "scatter"}, {"type": "pie"}], + ], + column_widths=[0.7, 0.3], + ) + + # Create pie chart of label distribution if it was calculated. + if "label distribution" in description.index: + label_distribution_pie_chart_df = description.loc["label distribution"] + label_distribution_pie_chart_fig = go.Pie( + labels=label_distribution_pie_chart_df.index.tolist(), + values=label_distribution_pie_chart_df.values.flatten().tolist(), + title="Label Distributions", + ) + + # Create histogram of document lengths + document_lengths_fig = go.Scatter( + x=document_lengths_histogram_df["Number of Documents"], + y=document_lengths_histogram_df["Document Length"], + fill="tozeroy", + name="Document Length Histogram", + showlegend=False, + ) + + # Create bar charts for documents / unique / missing + number_of_duplicates = ( + description.loc["number of documents"].values[0][0] + - description.loc["number of unique documents"].values[0][0] + - description.loc["number of missing documents"].values[0][0] + ) + + schart = go.Sankey( + node=dict( + pad=15, + thickness=20, + label=[ + "Total Number of Documents", + "Missing Documents", + "Unique Documents", + "Duplicate Documents", + ], + color=[ + "rgba(122,122,255,0.8)", + "rgba(255,153,51,0.8)", + "rgba(141,211,199,0.8)", + "rgba(235,83,83,0.8)", + ], + ), + link=dict( + # indices correspond to labels, eg A1, A2, A2, B1, ... + source=[0, 0, 0], + target=[2, 1, 3], + color=[ + "rgba(179,226,205,0.6)", + "rgba(250,201,152,0.6)", + "rgba(255,134,134,0.6)", + ], + value=[ + description.loc["number of unique documents"].values[0][0], + number_of_duplicates, + description.loc["number of missing documents"].values[0][0], + ], + ), + ) + + # Create Table to show the 10 most common words (with and without stopwords) + table = go.Table( + header=dict(values=["Top Words with Stopwords", "Top Words without Stopwords"]), + cells=dict( + values=[ + description.loc["most common words"].values[0][0], + description.loc["most common words excluding stopwords"].values[0][0], + ] + ), + ) + + # Combine figures. 
+ fig.add_trace(label_distribution_pie_chart_fig, row=2, col=2) + fig.add_trace(document_lengths_fig, row=2, col=1) + + fig.add_trace(schart, row=1, col=1) + + fig.add_trace(table, row=1, col=2) + + # Style and show figure. + fig.update_layout(plot_bgcolor="rgb(255,255,255)", barmode="stack") + fig.update_xaxes(title_text="Document Length", row=2, col=1) + fig.update_yaxes(title_text="Number of Documents", row=2, col=1) + fig.update_layout(legend=dict(yanchor="bottom", y=0, x=1.1, xanchor="right",)) + + fig.show() From aa3aa569ef7b6a8ca178f6178dc3c1026dceb285 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn <maximilian.krahn@icloud.com> Date: Sun, 6 Sep 2020 21:07:22 +0200 Subject: [PATCH 18/22] added correct order --- texthero/visualization.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/texthero/visualization.py b/texthero/visualization.py index b6f775c1..9c5faecd 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -374,9 +374,10 @@ def visualize_describe(df, text_col_name="text", labels_col_name="topic"): thickness=20, label=[ "Total Number of Documents", - "Missing Documents", - "Unique Documents", "Duplicate Documents", + "Unique Documents", + "Missing Documents", + ], color=[ "rgba(122,122,255,0.8)", From ea5c64072c9f794f569febbd79a9977aefc7c848 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn <maximilian.krahn@icloud.com> Date: Sun, 6 Sep 2020 20:59:40 +0200 Subject: [PATCH 19/22] added test and formatting: Co-authored-by: Henri Froese <hf2000510@gmail.com> --- tests/test_visualization.py | 11 ++++ texthero/visualization.py | 126 +++++++++++++++++++++++++++++++++++- 2 files changed, 136 insertions(+), 1 deletion(-) diff --git a/tests/test_visualization.py b/tests/test_visualization.py index d0075389..a88351dd 100644 --- a/tests/test_visualization.py +++ b/tests/test_visualization.py @@ -79,3 +79,14 @@ def test_top_words_digits_punctuation(self): def test_wordcloud(self): s = pd.Series("one two three") self.assertEqual(visualization.wordcloud(s), None) + + """ + Test visualization of describe function + """ + + def test_visualisation_describe(self): + df = pd.DataFrame( + [["one two three", "here"], ["one two three", "here"]], + columns=["text", "topic"], + ) + self.assertEqual(visualization.visualize_describe(df), None) diff --git a/texthero/visualization.py b/texthero/visualization.py index 0b3893b2..b6f775c1 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -2,20 +2,24 @@ Visualize insights and statistics of a text-based Pandas DataFrame. """ +import plotly.graph_objects as go +from plotly.subplots import make_subplots +import os import pandas as pd import numpy as np import plotly.express as px +import warnings from wordcloud import WordCloud from texthero import preprocessing from texthero._types import TextSeries, InputSeries -import string from matplotlib.colors import LinearSegmentedColormap as lsg import matplotlib.pyplot as plt from collections import Counter +import string def scatterplot( @@ -304,3 +308,123 @@ def top_words(s: TextSeries, normalize=False) -> pd.Series: .explode() # one word for each line .value_counts(normalize=normalize) ) + + +def visualize_describe(df, text_col_name="text", labels_col_name="topic"): + + s = df[text_col_name] + s_labels = df[labels_col_name] + + # Gather data (most from hero.describe, just + # the document lengths histogram is calculated here). 
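+    # np.histogram below returns (counts, bin_edges), with one more bin edge
+    # than counts; a 0 is therefore prepended to the counts so both fit into
+    # one DataFrame. Note the dict keys are swapped relative to their content
+    # (counts end up under "Document Length", bin edges under "Number of
+    # Documents"); the scatter trace further down compensates by plotting
+    # x="Number of Documents" against y="Document Length".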
+ s_tokenized = preprocessing.tokenize(s) + has_content_mask = preprocessing.has_content(s) + s_document_lengths = s_tokenized[has_content_mask].map(lambda x: len(x)) + + document_lengths_histogram = np.histogram(s_document_lengths.values, bins=20) + + document_lengths_histogram_df = pd.DataFrame( + { + "Document Length": np.insert(document_lengths_histogram[0], 0, 0), + "Number of Documents": document_lengths_histogram[1], + } + ) + + description = preprocessing.describe(s, s_labels) + + # Initialize Figure + fig = make_subplots( + rows=2, + cols=2, + specs=[ + [{"type": "sankey"}, {"type": "table"}], + [{"type": "scatter"}, {"type": "pie"}], + ], + column_widths=[0.7, 0.3], + ) + + # Create pie chart of label distribution if it was calculated. + if "label distribution" in description.index: + label_distribution_pie_chart_df = description.loc["label distribution"] + label_distribution_pie_chart_fig = go.Pie( + labels=label_distribution_pie_chart_df.index.tolist(), + values=label_distribution_pie_chart_df.values.flatten().tolist(), + title="Label Distributions", + ) + + # Create histogram of document lengths + document_lengths_fig = go.Scatter( + x=document_lengths_histogram_df["Number of Documents"], + y=document_lengths_histogram_df["Document Length"], + fill="tozeroy", + name="Document Length Histogram", + showlegend=False, + ) + + # Create bar charts for documents / unique / missing + number_of_duplicates = ( + description.loc["number of documents"].values[0][0] + - description.loc["number of unique documents"].values[0][0] + - description.loc["number of missing documents"].values[0][0] + ) + + schart = go.Sankey( + node=dict( + pad=15, + thickness=20, + label=[ + "Total Number of Documents", + "Missing Documents", + "Unique Documents", + "Duplicate Documents", + ], + color=[ + "rgba(122,122,255,0.8)", + "rgba(255,153,51,0.8)", + "rgba(141,211,199,0.8)", + "rgba(235,83,83,0.8)", + ], + ), + link=dict( + # indices correspond to labels, eg A1, A2, A2, B1, ... + source=[0, 0, 0], + target=[2, 1, 3], + color=[ + "rgba(179,226,205,0.6)", + "rgba(250,201,152,0.6)", + "rgba(255,134,134,0.6)", + ], + value=[ + description.loc["number of unique documents"].values[0][0], + number_of_duplicates, + description.loc["number of missing documents"].values[0][0], + ], + ), + ) + + # Create Table to show the 10 most common words (with and without stopwords) + table = go.Table( + header=dict(values=["Top Words with Stopwords", "Top Words without Stopwords"]), + cells=dict( + values=[ + description.loc["most common words"].values[0][0], + description.loc["most common words excluding stopwords"].values[0][0], + ] + ), + ) + + # Combine figures. + fig.add_trace(label_distribution_pie_chart_fig, row=2, col=2) + fig.add_trace(document_lengths_fig, row=2, col=1) + + fig.add_trace(schart, row=1, col=1) + + fig.add_trace(table, row=1, col=2) + + # Style and show figure. 
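+    # White plot background; the axis titles only apply to the histogram
+    # subplot at (2, 1), and the legend is anchored to the right of the grid.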
+ fig.update_layout(plot_bgcolor="rgb(255,255,255)", barmode="stack") + fig.update_xaxes(title_text="Document Length", row=2, col=1) + fig.update_yaxes(title_text="Number of Documents", row=2, col=1) + fig.update_layout(legend=dict(yanchor="bottom", y=0, x=1.1, xanchor="right",)) + + fig.show() From d72128f2089f4eb24476aac8433a9b7368b6a1ed Mon Sep 17 00:00:00 2001 From: Maximilian Krahn <maximilian.krahn@icloud.com> Date: Sun, 6 Sep 2020 21:07:22 +0200 Subject: [PATCH 20/22] added correct order --- texthero/visualization.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/texthero/visualization.py b/texthero/visualization.py index b6f775c1..9c5faecd 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -374,9 +374,10 @@ def visualize_describe(df, text_col_name="text", labels_col_name="topic"): thickness=20, label=[ "Total Number of Documents", - "Missing Documents", - "Unique Documents", "Duplicate Documents", + "Unique Documents", + "Missing Documents", + ], color=[ "rgba(122,122,255,0.8)", From 8cd4a1b6e4f467d96cd2f0d455785b1d17614e17 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn <maximilian.krahn@icloud.com> Date: Sun, 6 Sep 2020 21:46:22 +0200 Subject: [PATCH 21/22] added format --- texthero/visualization.py | 1 - 1 file changed, 1 deletion(-) diff --git a/texthero/visualization.py b/texthero/visualization.py index 9c5faecd..461f6e54 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -377,7 +377,6 @@ def visualize_describe(df, text_col_name="text", labels_col_name="topic"): "Duplicate Documents", "Unique Documents", "Missing Documents", - ], color=[ "rgba(122,122,255,0.8)", From 4cb1058f16646a20006c3fde68f5f98bdeceea8e Mon Sep 17 00:00:00 2001 From: Henri Froese <henri.froese@yahoo.com> Date: Wed, 9 Sep 2020 22:22:16 +0200 Subject: [PATCH 22/22] Incorporate suggested changes. - Docstring - Better signature --- tests/test_visualization.py | 8 +++- texthero/visualization.py | 81 +++++++++++++++++++++++++++---------- 2 files changed, 67 insertions(+), 22 deletions(-) diff --git a/tests/test_visualization.py b/tests/test_visualization.py index a88351dd..963263e1 100644 --- a/tests/test_visualization.py +++ b/tests/test_visualization.py @@ -1,6 +1,7 @@ import string import pandas as pd +import plotly import doctest from texthero import visualization @@ -89,4 +90,9 @@ def test_visualisation_describe(self): [["one two three", "here"], ["one two three", "here"]], columns=["text", "topic"], ) - self.assertEqual(visualization.visualize_describe(df), None) + self.assertIsInstance( + visualization.visualize_describe( + df["text"], df["topic"], return_figure=True + ), + plotly.graph_objs._figure.Figure, + ) diff --git a/texthero/visualization.py b/texthero/visualization.py index 461f6e54..d855af15 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -310,10 +310,38 @@ def top_words(s: TextSeries, normalize=False) -> pd.Series: ) -def visualize_describe(df, text_col_name="text", labels_col_name="topic"): +def visualize_describe(s: TextSeries, s_labels: pd.Series = None, return_figure=False): + """ + Visualize statistics about a given TextSeries, and + optionally a given Series with labels/classes. + + This function visualizes the output of + :meth:`texthero.preprocessing.describe`. + + Parameters + ---------- + s: TextSeries + The Series that should be described. + + s_labels : pd.Series + A Series with the labels / classes / topics + of the texts in the first argument. 
- s = df[text_col_name] - s_labels = df[labels_col_name] + return_figure : bool, default to False + Whether to return the figure instead of showing it. + + Examples + -------- + >>> import texthero as hero + >>> import pandas as pd + >>> df = pd.read_csv("https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv") # doctest: +SKIP + >>> df.head(2) # doctest: +SKIP + text topic + 0 Claxton hunting first major medal\n\nBritish h... athletics + 1 O'Sullivan could run in Worlds\n\nSonia O'Sull... athletics + >>> # Describe both the text and the labels + >>> hero.visualize_describe(df["text"], df["topic"]) # doctest: +SKIP + """ # Gather data (most from hero.describe, just # the document lengths histogram is calculated here). @@ -351,6 +379,8 @@ def visualize_describe(df, text_col_name="text", labels_col_name="topic"): values=label_distribution_pie_chart_df.values.flatten().tolist(), title="Label Distributions", ) + else: + label_distribution_pie_chart_fig = None # Create histogram of document lengths document_lengths_fig = go.Scatter( @@ -361,12 +391,25 @@ def visualize_describe(df, text_col_name="text", labels_col_name="topic"): showlegend=False, ) + if s_labels is not None: # labels given -> description output is multiindexed + n_total_docs = description.loc["number of documents"].values[0][0] + n_unique_docs = description.loc["number of unique documents"].values[0][0] + n_missing_docs = description.loc["number of missing documents"].values[0][0] + most_common_words = description.loc["most common words"].values[0][0] + most_common_words_excluding_stopwords = description.loc[ + "most common words excluding stopwords" + ].values[0][0] + else: + n_total_docs = description.loc["number of documents"].values[0] + n_unique_docs = description.loc["number of unique documents"].values[0] + n_missing_docs = description.loc["number of missing documents"].values[0] + most_common_words = description.loc["most common words"].values[0] + most_common_words_excluding_stopwords = description.loc[ + "most common words excluding stopwords" + ].values[0] + # Create bar charts for documents / unique / missing - number_of_duplicates = ( - description.loc["number of documents"].values[0][0] - - description.loc["number of unique documents"].values[0][0] - - description.loc["number of missing documents"].values[0][0] - ) + n_duplicate_docs = n_total_docs - n_unique_docs - n_missing_docs schart = go.Sankey( node=dict( @@ -394,27 +437,20 @@ def visualize_describe(df, text_col_name="text", labels_col_name="topic"): "rgba(250,201,152,0.6)", "rgba(255,134,134,0.6)", ], - value=[ - description.loc["number of unique documents"].values[0][0], - number_of_duplicates, - description.loc["number of missing documents"].values[0][0], - ], + value=[n_unique_docs, n_duplicate_docs, n_missing_docs,], ), ) # Create Table to show the 10 most common words (with and without stopwords) table = go.Table( header=dict(values=["Top Words with Stopwords", "Top Words without Stopwords"]), - cells=dict( - values=[ - description.loc["most common words"].values[0][0], - description.loc["most common words excluding stopwords"].values[0][0], - ] - ), + cells=dict(values=[most_common_words, most_common_words_excluding_stopwords,]), ) # Combine figures. 
- fig.add_trace(label_distribution_pie_chart_fig, row=2, col=2) + if label_distribution_pie_chart_fig is not None: + fig.add_trace(label_distribution_pie_chart_fig, row=2, col=2) + fig.add_trace(document_lengths_fig, row=2, col=1) fig.add_trace(schart, row=1, col=1) @@ -427,4 +463,7 @@ def visualize_describe(df, text_col_name="text", labels_col_name="topic"): fig.update_yaxes(title_text="Number of Documents", row=2, col=1) fig.update_layout(legend=dict(yanchor="bottom", y=0, x=1.1, xanchor="right",)) - fig.show() + if return_figure: + return fig + else: + fig.show()