diff --git a/README.md b/README.md index 9624e86..0024465 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,9 @@ The resulting matches can be accessed through `model.get_matches()`: ``` -**NOTE**: When instantiating `PolyFuzz` we also could have used "EditDistance" or "Embeddings" to quickly +**NOTE 1**: If you want to compare distances within a single list, you can simply pass that list as such: `model.match(from_list)` + +**NOTE 2**: When instantiating `PolyFuzz` we also could have used "EditDistance" or "Embeddings" to quickly access Levenshtein and FastText (English) respectively. ### Group Matches diff --git a/docs/releases.md b/docs/releases.md index 2f096c2..ae96288 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -1,3 +1,27 @@ +v0.3.4 +- Make sure that when you use two lists that are exactly the same, it will return 1 for identical terms: + +```python +from polyfuzz import PolyFuzz +from_list = ["apple", "house"] +model = PolyFuzz("TF-IDF") +model.match(from_list, from_list) +``` + +This will match each word in `from_list` to itself and give it a score of 1. Thus, `apple` will be matched to `apple` and +`house` will be mapped to `house`. However, if you input just a single list, it will try to map the words within the list without +mapping any word to itself: + +```python +from polyfuzz import PolyFuzz +from_list = ["apple", "apples"] +model = PolyFuzz("TF-IDF") +model.match(from_list) +``` + +In the example above, `apple` will be mapped to `apples` and not to `apple`. Here, we assume that the user wants to +find the most similar words within a list without mapping a word to itself. 
+ v0.3.3 - Update numpy to "numpy>=1.20.0" to prevent [this](https://github.com/MaartenGr/PolyFuzz/issues/23) and this [issue](https://github.com/MaartenGr/PolyFuzz/issues/21) - Update pytorch to "torch>=1.4.0,<1.7.1" to prevent save_state_warning error diff --git a/docs/tutorial/datasets/datasets.md b/docs/tutorial/datasets/datasets.md index 7ea228b..56a87de 100644 --- a/docs/tutorial/datasets/datasets.md +++ b/docs/tutorial/datasets/datasets.md @@ -1,10 +1,12 @@ # Datasets -There are two datasets prepared for you to play around with: -* Company Names +There are two datasets prepared for you to play around with: + +* Company Names * Movie Titles ## Movie Titles -This data is retrieved from: +This data is retrieved from: + * https://www.kaggle.com/stefanoleone992/imdb-extensive-dataset * https://www.kaggle.com/shivamb/netflix-shows @@ -22,7 +24,7 @@ model = PolyFuzz("TF-IDF").match(data["Netflix"], data["IMDB"]) ``` ## Company Names -This data is retrieved from https://www.kaggle.com/dattapiy/sec-edgar-companies-list?select=sec__edgar_company_info.csv +This data is retrieved from [here](https://www.kaggle.com/dattapiy/sec-edgar-companies-list?select=sec__edgar_company_info.csv) and contains 100_000 company names to be matched against each other. This is a different use case than what you have typically seen so far. We often see two different lists compared @@ -36,8 +38,8 @@ from polyfuzz import PolyFuzz from polyfuzz.datasets import load_company_names data = load_company_names() -model = PolyFuzz("TF-IDF").match(data, data) +model = PolyFuzz("TF-IDF").match(data) ``` -PolyFuzz will recognize that the lists are similar and that you are looking to match the titles with themselves. +By passing only a single list, PolyFuzz will recognize that you are looking to match the company names with themselves. It will ignore any comparison a string has with itself, otherwise everything will get mapped to itself. 
diff --git a/notebooks/Overview.ipynb b/notebooks/Overview.ipynb index 2045f2d..2b84745 100644 --- a/notebooks/Overview.ipynb +++ b/notebooks/Overview.ipynb @@ -600,7 +600,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -614,7 +614,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.9.6" } }, "nbformat": 4, diff --git a/polyfuzz/__init__.py b/polyfuzz/__init__.py index 95d1811..d587adc 100644 --- a/polyfuzz/__init__.py +++ b/polyfuzz/__init__.py @@ -1,2 +1,2 @@ from .polyfuzz import PolyFuzz -__version__ = "0.3.3" +__version__ = "0.3.4" diff --git a/polyfuzz/models/_base.py b/polyfuzz/models/_base.py index 9f6bf26..461935e 100644 --- a/polyfuzz/models/_base.py +++ b/polyfuzz/models/_base.py @@ -13,7 +13,7 @@ def __init__(self, model_id: str = "Model 0"): @abstractmethod def match(self, from_list: List[str], - to_list: List[str]) -> pd.DataFrame: + to_list: List[str] = None) -> pd.DataFrame: """ Make sure you follow the same argument structure: Arguments: diff --git a/polyfuzz/models/_distance.py b/polyfuzz/models/_distance.py index f5f389d..fd29cd1 100644 --- a/polyfuzz/models/_distance.py +++ b/polyfuzz/models/_distance.py @@ -45,7 +45,7 @@ def __init__(self, def match(self, from_list: List[str], - to_list: List[str]) -> pd.DataFrame: + to_list: List[str] = None) -> pd.DataFrame: """ Calculate the edit distances between two list of strings by parallelizing the calculation and passing the lists in batches. 
@@ -66,9 +66,10 @@ def match(self, ["string_three", "string_four"]) ``` """ - if from_list == to_list: + if to_list is None: self.equal_lists = True expected_iterations = int(len(from_list)/2) + to_list = from_list.copy() else: expected_iterations = len(from_list) diff --git a/polyfuzz/models/_embeddings.py b/polyfuzz/models/_embeddings.py index 2da1b46..26d19b4 100644 --- a/polyfuzz/models/_embeddings.py +++ b/polyfuzz/models/_embeddings.py @@ -84,7 +84,7 @@ def __init__(self, def match(self, from_list: List[str], - to_list: List[str], + to_list: List[str] = None, embeddings_from: np.ndarray = None, embeddings_to: np.ndarray = None) -> pd.DataFrame: """ Matches the two lists of strings to each other and returns the best mapping @@ -109,7 +109,10 @@ def match(self, if not isinstance(embeddings_from, np.ndarray): embeddings_from = self._embed(from_list) if not isinstance(embeddings_to, np.ndarray): - embeddings_to = self._embed(to_list) + if to_list is None: + embeddings_to = self._embed(from_list) + else: + embeddings_to = self._embed(to_list) matches = cosine_similarity(embeddings_from, embeddings_to, from_list, to_list, diff --git a/polyfuzz/models/_rapidfuzz.py b/polyfuzz/models/_rapidfuzz.py index 1e5d676..32dc784 100644 --- a/polyfuzz/models/_rapidfuzz.py +++ b/polyfuzz/models/_rapidfuzz.py @@ -60,7 +60,7 @@ def __init__(self, def match(self, from_list: List[str], - to_list: List[str]) -> pd.DataFrame: + to_list: List[str] = None) -> pd.DataFrame: """ Calculate the edit distances between two list of strings by parallelizing the calculation and passing the lists in batches. 
@@ -81,9 +81,10 @@ def match(self, ["string_three", "string_four"]) ``` """ - if from_list == to_list: + if to_list is None: self.equal_lists = True expected_iterations = int(len(from_list)/2) + to_list = from_list.copy() else: expected_iterations = len(from_list) diff --git a/polyfuzz/models/_tfidf.py b/polyfuzz/models/_tfidf.py index f0cc542..d994f81 100644 --- a/polyfuzz/models/_tfidf.py +++ b/polyfuzz/models/_tfidf.py @@ -62,7 +62,7 @@ def __init__(self, def match(self, from_list: List[str], - to_list: List[str]) -> pd.DataFrame: + to_list: List[str] = None) -> pd.DataFrame: """ Match two lists of strings to each other and return the most similar strings Arguments: @@ -101,7 +101,7 @@ def _extract_tf_idf(self, tf_idf_from = vectorizer.transform(from_list) else: tf_idf_to = TfidfVectorizer(min_df=1, analyzer=self._create_ngrams).fit_transform(from_list) - tf_idf_from = None + tf_idf_from = tf_idf_to return tf_idf_from, tf_idf_to diff --git a/polyfuzz/models/_utils.py b/polyfuzz/models/_utils.py index 99f7d85..3ca19fb 100644 --- a/polyfuzz/models/_utils.py +++ b/polyfuzz/models/_utils.py @@ -51,13 +51,14 @@ def cosine_similarity(from_vector: np.ndarray, indices, similarity = extract_best_matches(from_vector, to_vector, method="sparse") ``` """ - if top_n > len(set(to_list)): - top_n = len(set(to_list)) - + if to_list is not None: + if top_n > len(set(to_list)): + top_n = len(set(to_list)) + # Slower but uses less memory if method == "knn": - if from_list == to_list: + if to_list is None: knn = NearestNeighbors(n_neighbors=top_n+1, n_jobs=-1, metric='cosine').fit(to_vector) distances, indices = knn.kneighbors(from_vector) distances = distances[:, 1:] @@ -80,7 +81,7 @@ def cosine_similarity(from_vector: np.ndarray, # to it at least to 2 for it to work similarity_matrix = awesome_cossim_topn(from_vector, to_vector.T, top_n+1, min_similarity) - if from_list == to_list: + if to_list is None: similarity_matrix = similarity_matrix.tolil() similarity_matrix.setdiag(0.) 
similarity_matrix = similarity_matrix.tocsr() @@ -93,7 +94,7 @@ def cosine_similarity(from_vector: np.ndarray, else: similarity_matrix = scikit_cosine_similarity(from_vector, to_vector) - if from_list == to_list: + if to_list is None: np.fill_diagonal(similarity_matrix, 0) indices = np.flip(np.argsort(similarity_matrix, axis=-1), axis=1)[:, :top_n] @@ -101,6 +102,9 @@ def cosine_similarity(from_vector: np.ndarray, similarities = [np.round(similarities[:, i], 3) for i in range(similarities.shape[1])] # Convert results to df + if to_list is None: + to_list = from_list.copy() + columns = (["From"] + ["To" if i == 0 else f"To_{i+1}" for i in range(top_n)] + ["Similarity" if i ==0 else f"Similarity_{i+1}" for i in range(top_n)]) diff --git a/polyfuzz/polyfuzz.py b/polyfuzz/polyfuzz.py index 79546d8..b0b7b59 100644 --- a/polyfuzz/polyfuzz.py +++ b/polyfuzz/polyfuzz.py @@ -85,13 +85,16 @@ def __init__(self, def match(self, from_list: List[str], - to_list: List[str], + to_list: List[str] = None, top_n: int = 1): """ Match the from_list of strings to the to_list of strings with whatever models you have initialized Arguments: - from_list: The list from which you want mappings + from_list: The list from which you want mappings. + If you want to map items within a list, and not map the + items to themselves, you can supply only the `from_list` and + ignore the `to_list`. to_list: The list where you want to map to top_n: The number of matches you want returned. 
This is currently only implemented for `polyfuzz.models.TFIDF` and `polyfuzz.models.Embeddings` as they @@ -304,7 +307,7 @@ def _create_groups(self, strings = list(self.matches[name].To.dropna().unique()) # Create clusters - matches = model.match(strings, strings) + matches = model.match(strings) clusters, cluster_id_map, cluster_name_map = single_linkage(matches, link_min_similarity) # Map the `to` list to groups diff --git a/setup.py b/setup.py index 425c759..8af809d 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ setup( name="polyfuzz", packages=find_packages(exclude=["notebooks", "docs"]), - version="0.3.3", + version="0.3.4", author="Maarten Grootendorst", author_email="maartengrootendorst@gmail.com", description="PolyFuzz performs fuzzy string matching, grouping, and evaluation.",