v0.3 (#17)
* Extract multiple best matches
* Add top_n to TF-IDF and Embeddings
* Update documentation and prepare for release
MaartenGr authored Apr 30, 2021
1 parent fbe0cbb commit a60dfc6
Showing 9 changed files with 62 additions and 25 deletions.
3 changes: 3 additions & 0 deletions docs/releases.md
@@ -1,3 +1,6 @@
v0.3.0
- Use `top_n` in `polyfuzz.models.TFIDF` and `polyfuzz.models.Embeddings`

v0.2.2
- Update grouping to include all strings only if identical lists of strings are compared

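A minimal sketch of the new `top_n` option at the PolyFuzz level (the example lists are illustrative):

```python
from polyfuzz import PolyFuzz

from_list = ["apple", "apples", "appl", "recal", "house", "similarity"]
to_list = ["apple", "apples", "mouse"]

# Request the three best matches per string instead of only the single best
model = PolyFuzz("TF-IDF")
model.match(from_list, to_list, top_n=3)

# Besides From/To/Similarity, the matches DataFrame now also contains
# To_2, Similarity_2, To_3, and Similarity_3 columns
print(model.get_matches())
```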
2 changes: 1 addition & 1 deletion polyfuzz/__init__.py
@@ -1,2 +1,2 @@
from .polyfuzz import PolyFuzz
__version__ = "0.2.2"
__version__ = "0.3.0"
4 changes: 3 additions & 1 deletion polyfuzz/models/_base.py
@@ -11,7 +11,9 @@ def __init__(self, model_id: str = "Model 0"):
self.type = "Base Model"

@abstractmethod
def match(self, from_list: List[str], to_list: List[str]) -> pd.DataFrame:
def match(self,
from_list: List[str],
to_list: List[str]) -> pd.DataFrame:
""" Make sure you follow the same argument structure:
Arguments:
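For reference, a minimal sketch of a custom model that follows this argument structure (the exact-match logic is purely illustrative):

```python
from typing import List

import pandas as pd

from polyfuzz.models import BaseMatcher


class ExactMatcher(BaseMatcher):
    """ Toy matcher: a string matches only if it appears verbatim in to_list """

    def match(self, from_list: List[str], to_list: List[str]) -> pd.DataFrame:
        matches = [word if word in to_list else None for word in from_list]
        similarities = [1. if match is not None else 0. for match in matches]
        return pd.DataFrame({"From": from_list,
                             "To": matches,
                             "Similarity": similarities})
```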
7 changes: 6 additions & 1 deletion polyfuzz/models/_embeddings.py
@@ -20,6 +20,7 @@ class Embeddings(BaseMatcher):
Arguments:
embedding_method: list of Flair embeddings to use
min_similarity: The minimum similarity between strings, otherwise return 0 similarity
top_n: The number of best matches you want returned
cosine_method: The method/package for calculating the cosine similarity.
Options: "sparse", "sklearn", "knn".
Sparse is the fastest and most memory efficient but requires a
@@ -59,6 +60,7 @@ class Embeddings(BaseMatcher):
def __init__(self,
embedding_method: Union[List, None] = None,
min_similarity: float = 0.75,
top_n: int = 1,
cosine_method: str = "sparse",
model_id: str = None):
super().__init__(model_id)
@@ -77,6 +79,7 @@ def __init__(self,
self.document_embeddings = embedding_method

self.min_similarity = min_similarity
self.top_n = top_n
self.cosine_method = cosine_method

def match(self,
@@ -110,7 +113,9 @@ def match(self,

matches = cosine_similarity(embeddings_from, embeddings_to,
from_list, to_list,
self.min_similarity, self.cosine_method)
self.min_similarity,
top_n=self.top_n,
method=self.cosine_method)

return matches

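A sketch of the new parameter in use, assuming Flair is installed (the embedding choice and lists are illustrative):

```python
from flair.embeddings import WordEmbeddings
from polyfuzz.models import Embeddings

from_list = ["apple", "apples", "appl"]
to_list = ["apple", "apples", "mouse"]

# FastText word embeddings via Flair; return the three best matches per string
fasttext = WordEmbeddings('news')
model = Embeddings(embedding_method=[fasttext], min_similarity=0, top_n=3)
matches = model.match(from_list, to_list)
```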
7 changes: 6 additions & 1 deletion polyfuzz/models/_tfidf.py
@@ -21,6 +21,7 @@ class TFIDF(BaseMatcher):
n_gram_range: The n_gram_range on a character-level
clean_string: Whether to clean the string such that only alphanumerical characters are kept
min_similarity: The minimum similarity between strings, otherwise return 0 similarity
top_n: The number of matches you want returned
cosine_method: The method/package for calculating the cosine similarity.
Options:
* sparse
@@ -48,6 +49,7 @@ def __init__(self,
n_gram_range: Tuple[int, int] = (3, 3),
clean_string: bool = True,
min_similarity: float = 0.75,
top_n: int = 1,
cosine_method: str = "sparse",
model_id: str = None):
super().__init__(model_id)
@@ -56,6 +58,7 @@ def __init__(self,
self.clean_string = clean_string
self.min_similarity = min_similarity
self.cosine_method = cosine_method
self.top_n = top_n

def match(self,
from_list: List[str],
@@ -82,7 +85,9 @@ def match(self,
tf_idf_from, tf_idf_to = self._extract_tf_idf(from_list, to_list)
matches = cosine_similarity(tf_idf_from, tf_idf_to,
from_list, to_list,
self.min_similarity, self.cosine_method)
self.min_similarity,
top_n=self.top_n,
method=self.cosine_method)

return matches

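The equivalent sketch for the TF-IDF matcher, using the default character trigrams (lists again illustrative):

```python
from polyfuzz.models import TFIDF

from_list = ["apple", "apples", "appl"]
to_list = ["apple", "apples", "mouse"]

# Character-level trigrams; return the three best matches per string
tfidf = TFIDF(n_gram_range=(3, 3), min_similarity=0, top_n=3)
matches = tfidf.match(from_list, to_list)
```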
48 changes: 33 additions & 15 deletions polyfuzz/models/_utils.py
@@ -17,6 +17,7 @@ def cosine_similarity(from_vector: np.ndarray,
from_list: List[str],
to_list: List[str],
min_similarity: float = 0.75,
top_n: int = 1,
method: str = "sparse") -> pd.DataFrame:
""" Calculate similarity between two matrices/vectors and return best matches
@@ -26,6 +27,7 @@ def cosine_similarity(from_vector: np.ndarray,
from_list: The list from which you want mappings
to_list: The list where you want to map to
min_similarity: The minimum similarity between strings, otherwise return 0 similarity
top_n: The number of best matches you want returned
method: The method/package for calculating the cosine similarity.
Options: "sparse", "sklearn", "knn".
Sparse is the fastest and most memory efficient but requires a
@@ -49,20 +51,22 @@ def cosine_similarity(from_vector: np.ndarray,
matches = cosine_similarity(from_vector, to_vector, from_list, to_list, method="sparse")
```
"""
if top_n > len(set(to_list)):
top_n = len(set(to_list))

# Slower but uses less memory
if method == "knn":

if from_list == to_list:
knn = NearestNeighbors(n_neighbors=2, n_jobs=-1, metric='cosine').fit(to_vector)
knn = NearestNeighbors(n_neighbors=top_n+1, n_jobs=-1, metric='cosine').fit(to_vector)
distances, indices = knn.kneighbors(from_vector)
distances = distances[:, 1]
indices = indices[:, 1]

distances = distances[:, 1:]
indices = indices[:, 1:]
else:
knn = NearestNeighbors(n_neighbors=1, n_jobs=-1, metric='cosine').fit(to_vector)
knn = NearestNeighbors(n_neighbors=top_n, n_jobs=-1, metric='cosine').fit(to_vector)
distances, indices = knn.kneighbors(from_vector)

similarity = [round(1 - distance, 3) for distance in distances.flatten()]
similarities = [np.round(1 - distances[:, i], 3) for i in range(distances.shape[1])]

# Fast, but has some installation issues
elif _HAVE_SPARSE_DOT and method == "sparse":
@@ -74,15 +78,16 @@ def cosine_similarity(from_vector: np.ndarray,
# There is a bug with awesome_cossim_topn such that when to_vector and from_vector
# have the same shape, setting topn to 1 does not work. Apparently, you need
# to set it to at least 2 for it to work
similarity_matrix = awesome_cossim_topn(from_vector, to_vector.T, 2, min_similarity)
similarity_matrix = awesome_cossim_topn(from_vector, to_vector.T, top_n+1, min_similarity)

if from_list == to_list:
similarity_matrix = similarity_matrix.tolil()
similarity_matrix.setdiag(0.)
similarity_matrix = similarity_matrix.tocsr()

indices = np.array(similarity_matrix.argmax(axis=1).T).flatten()
similarity = similarity_matrix.max(axis=1).toarray().T.flatten()
indices = np.flip(np.argsort(similarity_matrix.toarray(), axis=-1), axis=1)[:, :top_n]
similarities = np.flip(np.sort(similarity_matrix.toarray(), axis=-1), axis=1)[:, :top_n]
similarities = [np.round(similarities[:, i], 3) for i in range(similarities.shape[1])]

# Faster than knn and slower than sparse but uses more memory
else:
@@ -91,13 +96,26 @@ def cosine_similarity(from_vector: np.ndarray,
if from_list == to_list:
np.fill_diagonal(similarity_matrix, 0)

indices = similarity_matrix.argmax(axis=1)
similarity = similarity_matrix.max(axis=1)
indices = np.flip(np.argsort(similarity_matrix, axis=-1), axis=1)[:, :top_n]
similarities = np.flip(np.sort(similarity_matrix, axis=-1), axis=1)[:, :top_n]
similarities = [np.round(similarities[:, i], 3) for i in range(similarities.shape[1])]

# Convert results to df
matches = [to_list[idx] for idx in indices.flatten()]
matches = pd.DataFrame(np.vstack((from_list, matches, similarity)).T, columns=["From", "To", "Similarity"])
matches.Similarity = matches.Similarity.astype(float)
matches.loc[matches.Similarity < 0.001, "To"] = None
columns = (["From"] +
["To" if i == 0 else f"To_{i+1}" for i in range(top_n)] +
["Similarity" if i ==0 else f"Similarity_{i+1}" for i in range(top_n)]); columns
matches = [[to_list[idx] for idx in indices[:, i]] for i in range(indices.shape[1])]
matches = pd.DataFrame(np.vstack(([from_list], matches, similarities)).T, columns = columns)

# Update column order
columns = [["From", "To", "Similarity"]] + [[f"To_{i+2}", f"Similarity_{i+2}"] for i in range((top_n-1))]
matches = matches.loc[:, [title for column in columns for title in column]]

# Update types
for column in matches.columns:
if "Similarity" in column:
matches[column] = matches[column].astype(float)
matches.loc[matches[column] < 0.001, column] = float(0)
matches.loc[matches[column] < 0.001, column.replace("Similarity", "To")] = None

return matches
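Note that when `from_list` and `to_list` are identical, the code above requests one extra neighbor (or column) and discards the first so that a string cannot match itself. A sketch of calling this helper directly; `_utils` is a private module, so this is for illustration only, with toy vectors standing in for real TF-IDF or embedding vectors:

```python
import numpy as np

from polyfuzz.models._utils import cosine_similarity

# Toy 2-dimensional vectors; rows line up with the strings below
from_vector = np.array([[1., 0.], [0., 1.]])
to_vector = np.array([[.9, .1], [.8, .3], [0., 1.]])

matches = cosine_similarity(from_vector, to_vector,
                            ["cat", "dog"], ["cat!", "cats", "dog!"],
                            min_similarity=0, top_n=2, method="sklearn")
# Resulting columns: From, To, Similarity, To_2, Similarity_2
```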
12 changes: 8 additions & 4 deletions polyfuzz/polyfuzz.py
@@ -85,13 +85,17 @@ def __init__(self,

def match(self,
from_list: List[str],
to_list: List[str]):
to_list: List[str],
top_n: int = 1):
""" Match the from_list of strings to the to_list of strings with whatever models
you have initialized
Arguments:
from_list: The list from which you want mappings
to_list: The list where you want to map to
top_n: The number of matches you want returned. This is currently only implemented
for `polyfuzz.models.TFIDF` and `polyfuzz.models.Embeddings` as they
can computationally handle more comparisons.
Updates:
self.matches: A dictionary with the matches from all models, can
@@ -115,11 +119,11 @@ def match(self,
# Standard models - quick access
if isinstance(self.method, str):
if self.method in ["TF-IDF", "TFIDF"]:
self.matches = {"TF-IDF": TFIDF(min_similarity=0).match(from_list, to_list)}
self.matches = {"TF-IDF": TFIDF(min_similarity=0, top_n=top_n).match(from_list, to_list)}
elif self.method in ["EditDistance", "Edit Distance"]:
self.matches = {"EditDistance": RapidFuzz().match(from_list, to_list)}
elif self.method in ["Embeddings", "Embedding"]:
self.matches = {"Embeddings": Embeddings(min_similarity=0).match(from_list, to_list)}
self.matches = {"Embeddings": Embeddings(min_similarity=0, top_n=top_n).match(from_list, to_list)}
else:
raise ValueError("Please instantiate the model with one of the following methods: \n"
"* 'TF-IDF'\n"
@@ -242,7 +246,7 @@ def get_ids(self) -> Union[str, List[str], None]:
return None

def get_matches(self, model_id: str = None) -> Union[pd.DataFrame,
Mapping[str, pd.DataFrame]]:
Mapping[str, pd.DataFrame]]:
""" Get the matches from one or more models"""
check_matches(self)

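Besides the quick-access strings above, `top_n` can also be configured on the model itself when an instance is passed to PolyFuzz (a sketch; parameter values are illustrative):

```python
from polyfuzz import PolyFuzz
from polyfuzz.models import TFIDF

from_list = ["apple", "apples", "appl"]
to_list = ["apple", "apples", "mouse"]

# Set top_n on the model rather than on match()
tfidf = TFIDF(min_similarity=0.5, top_n=3, model_id="TF-IDF")
model = PolyFuzz(tfidf).match(from_list, to_list)
```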
2 changes: 1 addition & 1 deletion setup.py
@@ -37,7 +37,7 @@
setup(
name="polyfuzz",
packages=find_packages(exclude=["notebooks", "docs"]),
version="0.2.2",
version="0.3.0",
author="Maarten Grootendorst",
author_email="[email protected]",
description="PolyFuzz performs fuzzy string matching, grouping, and evaluation.",
2 changes: 1 addition & 1 deletion tests/test_linkage.py
@@ -26,6 +26,6 @@ def test_linkage(min_similarity):
assert max(cluster_mapping.values()) == 1
assert len(cluster_name_map) == 2

else:
elif min_similarity >= 0.6:
assert max(cluster_mapping.values()) > 1
assert len(cluster_name_map) == 3
