v0.3.4 (#30)
* Fix duplicate lists not mapping
MaartenGr authored Nov 5, 2021
1 parent 672c90e commit 241d7d3
Showing 13 changed files with 69 additions and 29 deletions.
4 changes: 3 additions & 1 deletion README.md
@@ -98,7 +98,9 @@ The resulting matches can be accessed through `model.get_matches()`:

```

**NOTE**: When instantiating `PolyFuzz` we also could have used "EditDistance" or "Embeddings" to quickly
**NOTE 1**: If you want to compare distances within a single list, you can simply pass in that one list: `model.match(from_list)`

**NOTE 2**: When instantiating `PolyFuzz` we could also have used "EditDistance" or "Embeddings" to quickly
access Levenshtein and FastText (English) respectively.

### Group Matches
24 changes: 24 additions & 0 deletions docs/releases.md
@@ -1,3 +1,27 @@
v0.3.4
- Fixed matching so that when you use two lists that are exactly the same, identical terms are matched to each other with a score of 1:

```python
from polyfuzz import PolyFuzz
from_list = ["apple", "house"]
model = PolyFuzz("TF-IDF")
model.match(from_list, from_list)
```

This will match each word in `from_list` to itself and give it a score of 1. Thus, `apple` will be matched to `apple` and
`house` will be mapped to `house`. However, if you pass in just a single list, it will map the words within that list
without mapping any word to itself:

```python
from polyfuzz import PolyFuzz
from_list = ["apple", "apples"]
model = PolyFuzz("TF-IDF")
model.match(from_list)
```

In the example above, `apple` will be mapped to `apples` and not to `apple`. Here, we assume that the user wants to
find the most similar words within a list without mapping any word to itself.
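
For reference, the resulting mapping can be inspected through `model.get_matches()`. A minimal sketch, assuming the `From`/`To`/`Similarity` column layout used by `cosine_similarity` in `polyfuzz/models/_utils.py`:

```python
from polyfuzz import PolyFuzz

from_list = ["apple", "apples"]
model = PolyFuzz("TF-IDF")
model.match(from_list)

# The matches are stored as a DataFrame with "From", "To" and "Similarity"
# columns; with a single input list, no word is matched to itself.
print(model.get_matches())
```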

v0.3.3
- Update numpy to "numpy>=1.20.0" to prevent [this issue](https://github.com/MaartenGr/PolyFuzz/issues/23) and [this issue](https://github.com/MaartenGr/PolyFuzz/issues/21)
- Update pytorch to "torch>=1.4.0,<1.7.1" to prevent the `save_state_warning` error
14 changes: 8 additions & 6 deletions docs/tutorial/datasets/datasets.md
@@ -1,10 +1,12 @@
# Datasets
There are two datasets prepared for you to play around with:
* Company Names
There are two datasets prepared for you to play around with:

* Company Names
* Movie Titles

## Movie Titles
This data is retrieved from:
This data is retrieved from:

* https://www.kaggle.com/stefanoleone992/imdb-extensive-dataset
* https://www.kaggle.com/shivamb/netflix-shows

@@ -22,7 +24,7 @@ model = PolyFuzz("TF-IDF").match(data["Netflix"], data["IMDB"])
```

## Company Names
This data is retrieved from https://www.kaggle.com/dattapiy/sec-edgar-companies-list?select=sec__edgar_company_info.csv
This data is retrieved from [here](https://www.kaggle.com/dattapiy/sec-edgar-companies-list?select=sec__edgar_company_info.csv)
and contains 100_000 company names to be matched against each other.

This is a different use case than what you have typically seen so far. We often see two different lists compared
@@ -36,8 +38,8 @@ from polyfuzz import PolyFuzz
from polyfuzz.datasets import load_company_names

data = load_company_names()
model = PolyFuzz("TF-IDF").match(data, data)
model = PolyFuzz("TF-IDF").match(data)
```

PolyFuzz will recognize that the lists are similar and that you are looking to match the titles with themselves.
By only passing in a single list, PolyFuzz will recognize that you are looking to match the company names with themselves.
It will ignore any comparison a string has with itself; otherwise, everything would simply get mapped to itself.
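
A minimal sketch of a follow-up step; the `group` call and its `link_min_similarity` value below follow general PolyFuzz usage and are illustrative assumptions rather than part of this dataset guide:

```python
from polyfuzz import PolyFuzz
from polyfuzz.datasets import load_company_names

data = load_company_names()
model = PolyFuzz("TF-IDF").match(data)

# Closest match for each company name; exact self-matches are excluded
matches = model.get_matches()

# Optionally group near-duplicate company names together
model.group(link_min_similarity=0.75)
```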
4 changes: 2 additions & 2 deletions notebooks/Overview.ipynb
@@ -600,7 +600,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -614,7 +614,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
"version": "3.9.6"
}
},
"nbformat": 4,
2 changes: 1 addition & 1 deletion polyfuzz/__init__.py
@@ -1,2 +1,2 @@
from .polyfuzz import PolyFuzz
__version__ = "0.3.3"
__version__ = "0.3.4"
2 changes: 1 addition & 1 deletion polyfuzz/models/_base.py
@@ -13,7 +13,7 @@ def __init__(self, model_id: str = "Model 0"):
@abstractmethod
def match(self,
from_list: List[str],
to_list: List[str]) -> pd.DataFrame:
to_list: List[str] = None) -> pd.DataFrame:
""" Make sure you follow the same argument structure:
Arguments:
5 changes: 3 additions & 2 deletions polyfuzz/models/_distance.py
@@ -45,7 +45,7 @@ def __init__(self,

def match(self,
from_list: List[str],
to_list: List[str]) -> pd.DataFrame:
to_list: List[str] = None) -> pd.DataFrame:
""" Calculate the edit distances between two list of strings
by parallelizing the calculation and passing the lists in
batches.
@@ -66,9 +66,10 @@ def match(self,
["string_three", "string_four"])
```
"""
if from_list == to_list:
if to_list is None:
self.equal_lists = True
expected_iterations = int(len(from_list)/2)
to_list = from_list.copy()
else:
expected_iterations = len(from_list)

7 changes: 5 additions & 2 deletions polyfuzz/models/_embeddings.py
@@ -84,7 +84,7 @@ def __init__(self,

def match(self,
from_list: List[str],
to_list: List[str],
to_list: List[str] = None,
embeddings_from: np.ndarray = None,
embeddings_to: np.ndarray = None) -> pd.DataFrame:
""" Matches the two lists of strings to each other and returns the best mapping
@@ -109,7 +109,10 @@
if not isinstance(embeddings_from, np.ndarray):
embeddings_from = self._embed(from_list)
if not isinstance(embeddings_to, np.ndarray):
embeddings_to = self._embed(to_list)
if to_list is None:
embeddings_to = self._embed(from_list)
else:
embeddings_to = self._embed(to_list)

matches = cosine_similarity(embeddings_from, embeddings_to,
from_list, to_list,
5 changes: 3 additions & 2 deletions polyfuzz/models/_rapidfuzz.py
@@ -60,7 +60,7 @@ def __init__(self,

def match(self,
from_list: List[str],
to_list: List[str]) -> pd.DataFrame:
to_list: List[str] = None) -> pd.DataFrame:
""" Calculate the edit distances between two list of strings
by parallelizing the calculation and passing the lists in
batches.
@@ -81,9 +81,10 @@ def match(self,
["string_three", "string_four"])
```
"""
if from_list == to_list:
if to_list is None:
self.equal_lists = True
expected_iterations = int(len(from_list)/2)
to_list = from_list.copy()
else:
expected_iterations = len(from_list)

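
A sketch of how the updated signature can be used directly; the `RapidFuzz` import path and `n_jobs` parameter follow the PolyFuzz documentation and should be treated as assumptions here:

```python
from polyfuzz.models import RapidFuzz

matcher = RapidFuzz(n_jobs=1)

# to_list now defaults to None, so a single list is matched against itself;
# expected_iterations is halved internally and self-matches are excluded.
matches = matcher.match(["apple", "apples", "house"])
```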
4 changes: 2 additions & 2 deletions polyfuzz/models/_tfidf.py
@@ -62,7 +62,7 @@ def __init__(self,

def match(self,
from_list: List[str],
to_list: List[str]) -> pd.DataFrame:
to_list: List[str] = None) -> pd.DataFrame:
""" Match two lists of strings to each other and return the most similar strings
Arguments:
@@ -101,7 +101,7 @@ def _extract_tf_idf(self,
tf_idf_from = vectorizer.transform(from_list)
else:
tf_idf_to = TfidfVectorizer(min_df=1, analyzer=self._create_ngrams).fit_transform(from_list)
tf_idf_from = None
tf_idf_from = tf_idf_to

return tf_idf_from, tf_idf_to

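
A standalone sketch of the idea behind this change: with a single input list, one TF-IDF matrix is built and reused for both sides of the comparison, and the diagonal is zeroed so no string maps to itself. The character 3-gram analyzer below is a simplified stand-in for the internal `_create_ngrams`:

```python
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def char_ngrams(string, n=3):
    # Simplified stand-in for PolyFuzz's internal n-gram analyzer
    return [string[i:i + n] for i in range(len(string) - n + 1)]


words = ["apple", "apples", "house"]
tf_idf = TfidfVectorizer(min_df=1, analyzer=char_ngrams).fit_transform(words)

# The same matrix is used as both `tf_idf_from` and `tf_idf_to`
similarity = cosine_similarity(tf_idf, tf_idf)
np.fill_diagonal(similarity, 0)
best_match = similarity.argmax(axis=1)
```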
16 changes: 10 additions & 6 deletions polyfuzz/models/_utils.py
@@ -51,13 +51,14 @@ def cosine_similarity(from_vector: np.ndarray,
indices, similarity = extract_best_matches(from_vector, to_vector, method="sparse")
```
"""
if top_n > len(set(to_list)):
top_n = len(set(to_list))

if to_list is not None:
if top_n > len(set(to_list)):
top_n = len(set(to_list))

# Slower but uses less memory
if method == "knn":

if from_list == to_list:
if to_list is None:
knn = NearestNeighbors(n_neighbors=top_n+1, n_jobs=-1, metric='cosine').fit(to_vector)
distances, indices = knn.kneighbors(from_vector)
distances = distances[:, 1:]
@@ -80,7 +81,7 @@
# to it at least to 2 for it to work
similarity_matrix = awesome_cossim_topn(from_vector, to_vector.T, top_n+1, min_similarity)

if from_list == to_list:
if to_list is None:
similarity_matrix = similarity_matrix.tolil()
similarity_matrix.setdiag(0.)
similarity_matrix = similarity_matrix.tocsr()
@@ -93,14 +94,17 @@
else:
similarity_matrix = scikit_cosine_similarity(from_vector, to_vector)

if from_list == to_list:
if to_list is None:
np.fill_diagonal(similarity_matrix, 0)

indices = np.flip(np.argsort(similarity_matrix, axis=-1), axis=1)[:, :top_n]
similarities = np.flip(np.sort(similarity_matrix, axis=-1), axis=1)[:, :top_n]
similarities = [np.round(similarities[:, i], 3) for i in range(similarities.shape[1])]

# Convert results to df
if to_list is None:
to_list = from_list.copy()

columns = (["From"] +
["To" if i == 0 else f"To_{i+1}" for i in range(top_n)] +
["Similarity" if i ==0 else f"Similarity_{i+1}" for i in range(top_n)])
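
A standalone sketch of the `knn` branch above: when no `to_list` is given, one extra neighbor is requested and the first neighbor (the string itself, at distance 0) is dropped. The toy vectors stand in for TF-IDF or embedding rows and are purely illustrative:

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Toy vectors standing in for the rows of `from_vector`
vectors = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]])
top_n = 1

# Request top_n + 1 neighbors so the self-match can be discarded afterwards
knn = NearestNeighbors(n_neighbors=top_n + 1, n_jobs=-1, metric="cosine").fit(vectors)
distances, indices = knn.kneighbors(vectors)

# Drop the first column: each vector's nearest neighbor is itself at distance 0
distances, indices = distances[:, 1:], indices[:, 1:]
similarities = 1 - distances
```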
9 changes: 6 additions & 3 deletions polyfuzz/polyfuzz.py
@@ -85,13 +85,16 @@ def __init__(self,

def match(self,
from_list: List[str],
to_list: List[str],
to_list: List[str] = None,
top_n: int = 1):
""" Match the from_list of strings to the to_list of strings with whatever models
you have initialized
Arguments:
from_list: The list from which you want mappings
from_list: The list from which you want mappings.
If you want to map items within a list, and not map the
items to themselves, you can supply only the `from_list` and
ignore the `to_list`.
to_list: The list where you want to map to
top_n: The number of matches you want returned. This is currently only implemented
for `polyfuzz.models.TFIDF` and `polyfuzz.models.Embeddings` as they
@@ -304,7 +307,7 @@ def _create_groups(self,
strings = list(self.matches[name].To.dropna().unique())

# Create clusters
matches = model.match(strings, strings)
matches = model.match(strings)
clusters, cluster_id_map, cluster_name_map = single_linkage(matches, link_min_similarity)

# Map the `to` list to groups
2 changes: 1 addition & 1 deletion setup.py
@@ -37,7 +37,7 @@
setup(
name="polyfuzz",
packages=find_packages(exclude=["notebooks", "docs"]),
version="0.3.3",
version="0.3.4",
author="Maarten Grootendorst",
author_email="[email protected]",
description="PolyFuzz performs fuzzy string matching, grouping, and evaluation.",
