diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 42f19a2..2097ec1 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -25,6 +25,6 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -e ".[dev]" + pip install -e ".[dev, sbert]" - name: Run Checking Mechanisms run: make check diff --git a/.gitignore b/.gitignore index 46b5c1a..d2a29a1 100644 --- a/.gitignore +++ b/.gitignore @@ -75,3 +75,6 @@ venv.bak/ .idea .idea/ + +# For quick testing +/Untitled.ipynb diff --git a/README.md b/README.md index 0024465..b01bebf 100644 --- a/README.md +++ b/README.md @@ -22,24 +22,21 @@ You can install **`PolyFuzz`** via pip: pip install polyfuzz ``` -This will install the base dependencies. If you want to speed -up the cosine similarity comparison and decrease memory usage, -you can use `sparse_dot_topn` which is installed via: +You may want to install more depending on the transformers and language backends that you will be using. The possible installations are: -```bash -pip install polyfuzz[fast] -``` - -If you want to be making use of 🤗 Transformers, install the additional additional `Flair` dependency: - -```bash -pip install polyfuzz[flair] +```bash +pip install polyfuzz[sbert] +pip install polyfuzz[flair] +pip install polyfuzz[gensim] +pip install polyfuzz[spacy] +pip install polyfuzz[use] ``` -To install all the additional dependencies: +If you want to speed up the cosine similarity comparison and decrease memory usage when using embedding models, +you can use `sparse_dot_topn` which is installed via: ```bash -pip install polyfuzz[all] +pip install polyfuzz[fast] ```
@@ -103,6 +100,42 @@ The resulting matches can be accessed through `model.get_matches()`: **NOTE 2**: When instantiating `PolyFuzz` we also could have used "EditDistance" or "Embeddings" to quickly access Levenshtein and FastText (English) respectively. +### Production +The `.match` function allows you to quickly extract similar strings. However, after selecting the right models to be used, you may want to use PolyFuzz +in production to match incoming strings. To do so, we can make use of the familiar `fit`, `transform`, and `fit_transform` functions. + +Let's say that we have a list of words that we know to be correct called `train_words`. We want any incoming word to be mapped to one of the words in `train_words`. +In other words, we `fit` on `train_words` and we use `transform` on any incoming words: + +```python +from sklearn.datasets import fetch_20newsgroups +from sklearn.feature_extraction.text import CountVectorizer +from polyfuzz import PolyFuzz + +train_words = ["apple", "apples", "appl", "recal", "house", "similarity"] +unseen_words = ["apple", "apples", "mouse"] + +# Fit +model = PolyFuzz("TF-IDF") +model.fit(train_words) + +# Transform +results = model.transform(unseen_words) +``` + +In the above example, we are using `fit` on `train_words` to calculate the TF-IDF representation of those words which are saved to be used again in `transform`. +This speeds up `transform` quite a bit since all TF-IDF representations are stored when applying `fit`. + +Then, we can save and load the model as follows to be used in production: + +```python +# Save the model +model.save("my_model") + +# Load the model +loaded_model = PolyFuzz.load("my_model") +``` + ### Group Matches We can group the matches `To` as there might be significant overlap in strings in our to_list. 
To do this, we calculate the similarity within strings in to_list and use `single linkage` to then @@ -214,7 +247,7 @@ from polyfuzz.models import BaseMatcher class MyModel(BaseMatcher): - def match(self, from_list, to_list): + def match(self, from_list, to_list, **kwargs): # Calculate distances matches = [[fuzz.ratio(from_string, to_string) / 100 for to_string in to_list] for from_string in from_list] diff --git a/docs/api/models/gensim.md b/docs/api/models/gensim.md new file mode 100644 index 0000000..fd27f09 --- /dev/null +++ b/docs/api/models/gensim.md @@ -0,0 +1,3 @@ +# `polyfuzz.models.GensimEmbeddings` + +::: polyfuzz.models.GensimEmbeddings diff --git a/docs/api/models/sbert.md b/docs/api/models/sbert.md new file mode 100644 index 0000000..920e667 --- /dev/null +++ b/docs/api/models/sbert.md @@ -0,0 +1,3 @@ +# `polyfuzz.models.SentenceEmbeddings` + +::: polyfuzz.models.SentenceEmbeddings diff --git a/docs/api/models/spacy.md b/docs/api/models/spacy.md new file mode 100644 index 0000000..2915030 --- /dev/null +++ b/docs/api/models/spacy.md @@ -0,0 +1,3 @@ +# `polyfuzz.models.SpacyEmbeddings` + +::: polyfuzz.models.SpacyEmbeddings diff --git a/docs/api/models/use.md b/docs/api/models/use.md new file mode 100644 index 0000000..ce2d8af --- /dev/null +++ b/docs/api/models/use.md @@ -0,0 +1,3 @@ +# `polyfuzz.models.USEEmbeddings` + +::: polyfuzz.models.USEEmbeddings diff --git a/docs/index.md b/docs/index.md index 9862b0c..1338093 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,20 +8,4 @@ Currently, methods include Levenshtein distance with RapidFuzz, a character-base techniques such as FastText and GloVe, and 🤗 transformers embeddings. The philosophy of PolyFuzz is: `Easy to use yet highly customizable`. It is a string matcher tool that requires only -a few lines of code but that allows you customize and create your own models. 
- - -## Installation -You can install **`PolyFuzz`** via pip: - -``` -pip install polyfuzz -``` - -This will install the base dependencies and excludes any deep learning/embedding models. - -If you want to be making use of 🤗 Transformers, install the additional additional `Flair` dependency: - -``` -pip install polyfuzz[flair] -``` +a few lines of code but that allows you to customize and create your own models. \ No newline at end of file diff --git a/docs/releases.md b/docs/releases.md index ae96288..e09c498 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -1,8 +1,83 @@ -v0.3.4 +## **v0.4.0** + + +* Added new models (SentenceTransformers, Gensim, USE, Spacy) +* Added `.fit`, `.transform`, and `.fit_transform` methods +* Added `.save` and `PolyFuzz.load()` + + +**SentenceTransformers** +```python +from polyfuzz.models import SentenceEmbeddings +distance_model = SentenceEmbeddings("all-MiniLM-L6-v2") +model = PolyFuzz(distance_model) +``` + +**Gensim** +```python +from polyfuzz.models import GensimEmbeddings +distance_model = GensimEmbeddings("glove-twitter-25") +model = PolyFuzz(distance_model) +``` + +**USE** +```python +from polyfuzz.models import USEEmbeddings +distance_model = USEEmbeddings("https://tfhub.dev/google/universal-sentence-encoder/4") +model = PolyFuzz(distance_model) +``` + +**Spacy** +```python +from polyfuzz.models import SpacyEmbeddings +distance_model = SpacyEmbeddings("en_core_web_md") +model = PolyFuzz(distance_model) +``` + + +**fit, transform, fit_transform** +Add `fit`, `transform`, and `fit_transform` in order to use PolyFuzz in production [#34](https://github.com/MaartenGr/PolyFuzz/issues/34) + +```python +from sklearn.datasets import fetch_20newsgroups +from sklearn.feature_extraction.text import CountVectorizer +from polyfuzz import PolyFuzz + +train_words = ["apple", "apples", "appl", "recal", "house", "similarity"] +unseen_words = ["apple", "apples", "mouse"] + +# Fit +model = PolyFuzz("TF-IDF") +model.fit(train_words) + +# 
Transform +results = model.transform(unseen_words) +``` + +In the code above, we fit our TF-IDF model on `train_words` and use `.transform()` to match the words in `unseen_words` to the words that we trained on in `train_words`. + +After fitting our model, we can save it as follows: + +```python +model.save("my_model") +``` + +Then, we can load our model to be used elsewhere: + +```python +from polyfuzz import PolyFuzz + +model = PolyFuzz.load("my_model") +``` + + +## **v0.3.4** + - Make sure that when you use two lists that are exactly the same, it will return 1 for identical terms: ```python from polyfuzz import PolyFuzz + from_list = ["apple", "house"] model = PolyFuzz("TF-IDF") model.match(from_list, from_list) @@ -14,6 +89,7 @@ mapping to itself: ```python from polyfuzz import PolyFuzz + from_list = ["apple", "apples"] model = PolyFuzz("TF-IDF") model.match(from_list) @@ -22,32 +98,32 @@ model.match(from_list) In the example above, `apple` will be mapped to `apples` and not to `apple`. Here, we assume that the user wants to find the most similar words within a list without mapping to itself. 
-v0.3.3 +## **v0.3.3** - Update numpy to "numpy>=1.20.0" to prevent [this](https://github.com/MaartenGr/PolyFuzz/issues/23) and this [issue](https://github.com/MaartenGr/PolyFuzz/issues/21) - Update pytorch to "torch>=1.4.0,<1.7.1" to prevent save_state_warning error -v0.3.2 +## **v0.3.2** - Fix exploding memory usage when using `top_n` -v0.3.0 +## **v0.3.0** - Use `top_n` in `polyfuzz.models.TFIDF` and `polyfuzz.models.Embeddings` -v0.2.2 +## **v0.2.2** - Update grouping to include all strings only if identical lists of strings are compared -v0.2.0 +## **v0.2.0** - Update naming convention matcher --> model - Update documentation - Add basic models to grouper - Fix issues with vector order in cosine similarity - Update naming of cosine similarity function -v0.1.0 +## **v0.1.0** - Additional tests - More thorough documentation - Prepare for public release -v0.0.1 +## **v0.0.1** - First release of `PolyFuzz` - Matching through: - Edit Distance diff --git a/docs/tutorial/basematcher/basematcher.md b/docs/tutorial/basematcher/basematcher.md index 20c66fc..3091c5b 100644 --- a/docs/tutorial/basematcher/basematcher.md +++ b/docs/tutorial/basematcher/basematcher.md @@ -9,6 +9,7 @@ You simply create a class using `BaseMatcher`, make sure it has a function `matc two lists and outputs a pandas dataframe. That's it! 
We start by creating our own model that implements the ratio similarity measure from RapidFuzz: + ```python import numpy as np import pandas as pd @@ -19,7 +20,7 @@ from polyfuzz.models import BaseMatcher class MyModel(BaseMatcher): - def match(self, from_list, to_list): + def match(self, from_list, to_list, **kwargs): # Calculate distances matches = [[fuzz.ratio(from_string, to_string) / 100 for to_string in to_list] for from_string in from_list] @@ -53,3 +54,67 @@ model.visualize_precision_recall(kde=True) ``` ![](custom_model.png) + + +## fit, transform, fit_transform + +Although the above model can be used in production using `fit`, it does not track its state between `fit` and `transform`. +This is not necessary here, since edit distances should be recalculated but if you have embeddings that you do not +want to re-calculate, then it is helpful to track the states between `fit` and `transform` so that embeddings do not need +to be re-calculated. To do so, we can use the `re_train` parameter to define what happens if we re-train a model (for example when using `fit`) +and what happens when we do not re-train a model (for example when using `transform`). + +In the example below, when we set `re_train=True` we calculate the embeddings from both the `from_list` and `to_list` if they are defined +and save the embeddings to the `self.embeddings_to` variable. Then, when we set `re_train=False`, we can prevent redoing the `fit` by leveraging +the pre-calculated `self.embeddings_to` variable. 
+ +```python +import numpy as np +from sentence_transformers import SentenceTransformer + +from ._utils import cosine_similarity +from ._base import BaseMatcher + + +class SentenceEmbeddings(BaseMatcher): + def __init__(self, model_id): + super().__init__(model_id) + self.type = "Embeddings" + + self.embedding_model = SentenceTransformer("all-MiniLM-L6-v2") + self.embeddings_to = None + + def match(self, from_list, to_list, re_train=True) -> pd.DataFrame: + # Extract embeddings from the `from_list` + embeddings_from = self.embedding_model.encode(from_list, show_progress_bar=False) + + # Extract embeddings from the `to_list` if it exists + if not isinstance(embeddings_to, np.ndarray): + if not re_train: + embeddings_to = self.embeddings_to + elif to_list is None: + embeddings_to = self.embedding_model.encode(from_list, show_progress_bar=False) + else: + embeddings_to = self.embedding_model.encode(to_list, show_progress_bar=False) + + # Extract matches + matches = cosine_similarity(embeddings_from, embeddings_to, from_list, to_list) + + self.embeddings_to = embeddings_to + + return matches +``` + +Then, we can use it as follows: + +```python +from_list = ["apple", "apples", "appl", "recal", "house", "similarity"] +to_list = ["apple", "apples", "mouse"] + +custom_matcher = MyModel() + +model = PolyFuzz(custom_matcher).fit(from_list) +``` + +By using the `.fit` function, embeddings are created from the `from_list` variable and saved. Then, when we +run `model.transform(to_list)`, the embeddings created from the `from_list` variable do not need to be recalculated. \ No newline at end of file diff --git a/docs/tutorial/models/models.md b/docs/tutorial/models/models.md index f580bf6..89627f0 100644 --- a/docs/tutorial/models/models.md +++ b/docs/tutorial/models/models.md @@ -4,10 +4,13 @@ Currently, the following models are implemented in PolyFuzz: 2. EditDistance with RapidFuzz 3. FastText and GloVe 4. 🤗 Transformers +5. SentenceTransformers +6. Gensim +7. 
Spacy With `Flair`, we can use all 🤗 Transformers that are [publicly available](https://huggingface.co/transformers/pretrained_models.html). -We simply have to instantiate any Flair WordEmbedding method and pass it through PolyFuzzy. +We simply have to instantiate any Flair WordEmbedding method and pass it through PolyFuzz. All models listed above can be found in `polyfuzz.models` and can be used to create and compare different matchers. @@ -78,7 +81,7 @@ With `Flair`, we can use all 🤗 Transformers that are The embeddings that are created are compared with cosine similarity in order to understand how similar the created embeddings are to each other. -We simply have to instantiate any Flair WordEmbedding method and pass it through PolyFuzzy: +We simply have to instantiate any Flair WordEmbedding method and pass it through PolyFuzz: ```python from polyfuzz import PolyFuzz @@ -117,6 +120,142 @@ matchers = [bert_matcher, fasttext_matcher] models = PolyFuzz(matchers).match(from_list, to_list) ``` +## SentenceTransformers +We can use `sentence-transformers` to generate embeddings from our input list and find the closest matching +entities using cosine similarity. We simply have to instantiate our `SentenceEmbeddings` class and pass it to PolyFuzz: + + +```python +from polyfuzz import PolyFuzz +from polyfuzz.models import SentenceEmbeddings + +from_list = ["apple", "apples", "appl", "recal", "house", "similarity"] +to_list = ["apple", "apples", "mouse"] + +distance_model = SentenceEmbeddings("all-MiniLM-L6-v2") +model = PolyFuzz(distance_model).match(from_list, to_list) +``` + +For a full list of possible models, click [this](https://www.sbert.net/docs/pretrained_models.html) link. 
+ +You can also use a custom SentenceTransformer model: + +```python +from polyfuzz import PolyFuzz +from polyfuzz.models import SentenceEmbeddings +from sentence_transformers import SentenceTransformer + +from_list = ["apple", "apples", "appl", "recal", "house", "similarity"] +to_list = ["apple", "apples", "mouse"] + +embedding_model = SentenceTransformer("all-MiniLM-L6-v2") +distance_model = SentenceEmbeddings(embedding_model) +model = PolyFuzz(distance_model).match(from_list, to_list) +``` + +## Gensim +We can use `gensim` to load in a word embedding model to generate embeddings from our input list and find the closest matching +entities using cosine similarity. We simply have to instantiate our `GensimEmbeddings` class and pass it to PolyFuzz: + + +```python +from polyfuzz import PolyFuzz +from polyfuzz.models import GensimEmbeddings + +from_list = ["apple", "apples", "appl", "recal", "house", "similarity"] +to_list = ["apple", "apples", "mouse"] + +distance_model = GensimEmbeddings("glove-twitter-25") +model = PolyFuzz(distance_model).match(from_list, to_list) +``` + +For a full list of possible models, click [this](https://github.com/RaRe-Technologies/gensim-data#models) link. + +You can also use a custom Gensim model: + +```python +from polyfuzz import PolyFuzz +from polyfuzz.models import GensimEmbeddings +import gensim.downloader as api + +from_list = ["apple", "apples", "appl", "recal", "house", "similarity"] +to_list = ["apple", "apples", "mouse"] + +embedding_model = api.load("glove-twitter-25") +distance_model = GensimEmbeddings(embedding_model) +model = PolyFuzz(distance_model).match(from_list, to_list) +``` + +## Spacy +We can use `spacy` to load in an embedding model to generate embeddings from our input list and find the closest matching +entities using cosine similarity. 
We simply have to instantiate our `SpacyEmbeddings` class and pass it to PolyFuzz: + + +```python +from polyfuzz import PolyFuzz +from polyfuzz.models import SpacyEmbeddings + +from_list = ["apple", "apples", "appl", "recal", "house", "similarity"] +to_list = ["apple", "apples", "mouse"] + +distance_model = SpacyEmbeddings("en_core_web_md") +model = PolyFuzz(distance_model).match(from_list, to_list) +``` + +For a full list of possible models, click [this](https://spacy.io/usage/models) link. + +You can also use a custom Spacy model: + +```python +from polyfuzz import PolyFuzz +from polyfuzz.models import SpacyEmbeddings +import spacy + + +from_list = ["apple", "apples", "appl", "recal", "house", "similarity"] +to_list = ["apple", "apples", "mouse"] + +embedding_model = spacy.load("en_core_web_md", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']) +distance_model = SpacyEmbeddings(embedding_model) +model = PolyFuzz(distance_model).match(from_list, to_list) +``` + +## Universal Sentence Encoder (USE) + +The Universal Sentence Encoder encodes text into high-dimensional vectors that are used here for embedding the strings. +The model is trained and optimized for greater-than-word length text, such as sentences, phrases, or short paragraphs. + +We simply have to instantiate our `USEEmbeddings` class and pass it to PolyFuzz: + + +```python +from polyfuzz import PolyFuzz +from polyfuzz.models import USEEmbeddings + +from_list = ["apple", "apples", "appl", "recal", "house", "similarity"] +to_list = ["apple", "apples", "mouse"] + +distance_model = USEEmbeddings("https://tfhub.dev/google/universal-sentence-encoder/4") +model = PolyFuzz(distance_model).match(from_list, to_list) +``` + +For a full list of possible models, click [this](https://spacy.io/usage/models) link. 
+ +You can also use a custom USE model: + +```python +from polyfuzz import PolyFuzz +from polyfuzz.models import USEEmbeddings +import tensorflow_hub + +from_list = ["apple", "apples", "appl", "recal", "house", "similarity"] +to_list = ["apple", "apples", "mouse"] + +embedding_model = tensorflow_hub.load("https://tfhub.dev/google/universal-sentence-encoder/4") +distance_model = USEEmbeddings(embedding_model) +model = PolyFuzz(distance_model).match(from_list, to_list) +``` + ## Using Multiple Models ```python diff --git a/docs/tutorial/quickstart/quickstart.md b/docs/tutorial/quickstart/quickstart.md index f2ba0ac..eecda9b 100644 --- a/docs/tutorial/quickstart/quickstart.md +++ b/docs/tutorial/quickstart/quickstart.md @@ -5,12 +5,21 @@ You can install **`PolyFuzz`** via pip: pip install polyfuzz ``` -This will install the base dependencies and excludes any deep learning/embedding models. +You may want to install more depending on the transformers and language backends that you will be using. The possible installations are: -If you want to be making use of 🤗 Transformers, install the additional additional `Flair` dependency: +```bash +pip install polyfuzz[sbert] +pip install polyfuzz[flair] +pip install polyfuzz[gensim] +pip install polyfuzz[spacy] +pip install polyfuzz[use] +``` + +If you want to speed up the cosine similarity comparison and decrease memory usage when using embedding models, +you can use `sparse_dot_topn` which is installed via: ```bash -pip install polyfuzz[flair] +pip install polyfuzz[fast] ``` ## Getting Started @@ -47,6 +56,44 @@ The resulting matches can be accessed through `model.get_matches()`: **NOTE**: When instantiating `PolyFuzz` we also could have used "EditDistance" or "Embeddings" to quickly access Levenshtein and FastText (English) respectively. +### Fit / Transform +The `.match` function allows you to quickly extract similar strings. 
However, after selecting the right models to be used, you may want to use PolyFuzz +in production to match incoming strings. To do so, we can make use of the familiar `fit`, `transform`, and `fit_transform` functions. + +Let's say that we have a list of words that we know to be correct called `train_words`. We want any incoming word to be mapped to one of the words in `train_words`. +In other words, we `fit` on `train_words` and we use `transform` on any incoming words: + +```python +from sklearn.datasets import fetch_20newsgroups +from sklearn.feature_extraction.text import CountVectorizer +from polyfuzz import PolyFuzz + +train_words = ["apple", "apples", "appl", "recal", "house", "similarity"] +unseen_words = ["apple", "apples", "mouse"] + +# Fit +model = PolyFuzz("TF-IDF") +model.fit(train_words) + +# Transform +results = model.transform(unseen_words) +``` + +In the above example, we are using `fit` on `train_words` to calculate the TF-IDF representation of those words which are saved to be used again in `transform`. +This speeds up `transform` quite a bit since all TF-IDF representations are stored when applying `fit`. + +### Save / Load + +We can save and load the model as follows to be used in production: + +```python +# Save the model +model.save("my_model") + +# Load the model +loaded_model = PolyFuzz.load("my_model") +``` + ### Group Matches We can group the matches `To` as there might be significant overlap in strings in our to_list. 
To do this, we calculate the similarity within strings in to_list and use `single linkage` to then diff --git a/mkdocs.yml b/mkdocs.yml index 74d4ab1..43dbbee 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -23,6 +23,10 @@ nav: - RapidFuzzy: api/models/rapidfuzz.md - TFIDF: api/models/tfidf.md - Embeddings: api/models/embeddings.md + - Spacy: api/models/spacy.md + - SBERT: api/models/sbert.md + - Gensim: api/models/gensim.md + - USE: api/models/use.md - CosineSimilarity: api/models/matches.md - Releases: releases.md plugins: diff --git a/polyfuzz/__init__.py b/polyfuzz/__init__.py index d587adc..916525d 100644 --- a/polyfuzz/__init__.py +++ b/polyfuzz/__init__.py @@ -1,2 +1,2 @@ from .polyfuzz import PolyFuzz -__version__ = "0.3.4" +__version__ = "0.4.0" diff --git a/polyfuzz/models/__init__.py b/polyfuzz/models/__init__.py index abccfed..c6bebc4 100644 --- a/polyfuzz/models/__init__.py +++ b/polyfuzz/models/__init__.py @@ -11,10 +11,35 @@ except ModuleNotFoundError as e: Embeddings = NotInstalled("Flair and Huggingface Transformer Models", "flair") +try: + from._sbert import SentenceEmbeddings +except ModuleNotFoundError as e: + SentenceEmbeddings = NotInstalled("SentenceTransformers", "sbert") + +try: + from._gensim import GensimEmbeddings +except ModuleNotFoundError as e: + GensimEmbeddings = NotInstalled("Gensim", "gensim") + +try: + from._spacy import SpacyEmbeddings +except ModuleNotFoundError as e: + SpacyEmbeddings = NotInstalled("Spacy", "spacy") + +try: + from._use import USEEmbeddings +except ModuleNotFoundError as e: + USEEmbeddings = NotInstalled("USE", "use") + + __all__ = [ "BaseMatcher", "EditDistance", "Embeddings", + "SentenceEmbeddings", + "GensimEmbeddings", + "SpacyEmbeddings", + "USEEmbeddings", "RapidFuzz", "TFIDF", "cosine_similarity" diff --git a/polyfuzz/models/_base.py b/polyfuzz/models/_base.py index 461935e..1a97ba7 100644 --- a/polyfuzz/models/_base.py +++ b/polyfuzz/models/_base.py @@ -13,7 +13,8 @@ def __init__(self, model_id: str = 
"Model 0"): @abstractmethod def match(self, from_list: List[str], - to_list: List[str] = None) -> pd.DataFrame: + to_list: List[str] = None, + **kwargs) -> pd.DataFrame: """ Make sure you follow the same argument structure: Arguments: @@ -28,4 +29,3 @@ def match(self, * "Similarity" """ raise NotImplementedError() - diff --git a/polyfuzz/models/_distance.py b/polyfuzz/models/_distance.py index fd29cd1..580b799 100644 --- a/polyfuzz/models/_distance.py +++ b/polyfuzz/models/_distance.py @@ -45,7 +45,8 @@ def __init__(self, def match(self, from_list: List[str], - to_list: List[str] = None) -> pd.DataFrame: + to_list: List[str] = None, + **kwargs) -> pd.DataFrame: """ Calculate the edit distances between two list of strings by parallelizing the calculation and passing the lists in batches. diff --git a/polyfuzz/models/_embeddings.py b/polyfuzz/models/_embeddings.py index 26d19b4..bfedcd3 100644 --- a/polyfuzz/models/_embeddings.py +++ b/polyfuzz/models/_embeddings.py @@ -82,11 +82,14 @@ def __init__(self, self.top_n = top_n self.cosine_method = cosine_method + self.embeddings_to = None + def match(self, from_list: List[str], to_list: List[str] = None, embeddings_from: np.ndarray = None, - embeddings_to: np.ndarray = None) -> pd.DataFrame: + embeddings_to: np.ndarray = None, + re_train: bool = True) -> pd.DataFrame: """ Matches the two lists of strings to each other and returns the best mapping Arguments: @@ -94,6 +97,8 @@ def match(self, to_list: The list where you want to map to embeddings_from: Embeddings you created yourself from the `from_list` embeddings_to: Embeddings you created yourself from the `to_list` + re_train: Whether to re-train the model with new embeddings + Set this to False if you want to use this model in production Returns: matches: The best matches between the lists of strings @@ -106,10 +111,15 @@ def match(self, ["string_three", "string_four"]) ``` """ + # Extract embeddings from the `from_list` if not isinstance(embeddings_from, np.ndarray): 
embeddings_from = self._embed(from_list) + + # Extract embeddings from the `to_list` if it exists if not isinstance(embeddings_to, np.ndarray): - if to_list is None: + if not re_train: + embeddings_to = self.embeddings_to + elif to_list is None: embeddings_to = self._embed(from_list) else: embeddings_to = self._embed(to_list) @@ -120,6 +130,8 @@ def match(self, top_n=self.top_n, method=self.cosine_method) + self.embeddings_to = embeddings_to + return matches def _embed(self, strings: List[str]) -> np.ndarray: diff --git a/polyfuzz/models/_gensim.py b/polyfuzz/models/_gensim.py new file mode 100644 index 0000000..e1bedc1 --- /dev/null +++ b/polyfuzz/models/_gensim.py @@ -0,0 +1,140 @@ +import numpy as np +import pandas as pd +from typing import List, Union +from gensim.models.keyedvectors import Word2VecKeyedVectors + +from ._utils import cosine_similarity +from ._base import BaseMatcher + + +class GensimEmbeddings(BaseMatcher): + """ + Embed words into vectors and use cosine similarity to find + the best matches between two lists of strings + + Arguments: + embedding_model: The Gensim model to use, this can be either a string or the model directly + min_similarity: The minimum similarity between strings, otherwise return 0 similarity + top_n: The number of best matches you want returned + cosine_method: The method/package for calculating the cosine similarity. + Options: "sparse", "sklearn", "knn". + Sparse is the fastest and most memory efficient but requires a + package that might be difficult to install. 
+ Sklearn is a bit slower than sparse and requires significantly more memory as + the distance matrix is not sparse + Knn uses 1-nearest neighbor to extract the most similar strings + it is significantly slower than both methods but requires little memory + model_id: The name of the particular instance, used when comparing models + + Usage: + + ```python + distance_model = GensimEmbeddings("fasttext-wiki-news-subwords-300", min_similarity=0.5) + ``` + + Or if you want to directly pass a Gensim model: + + ```python + import gensim.downloader as api + embedding_model = api.load("fasttext-wiki-news-subwords-300") + distance_model = GensimEmbeddings(embedding_model, min_similarity=0.5) + ``` + """ + def __init__(self, + embedding_model: Union[str, Word2VecKeyedVectors] = "fasttext-wiki-news-subwords-300", + min_similarity: float = 0.75, + top_n: int = 1, + cosine_method: str = "sparse", + model_id: str = None): + super().__init__(model_id) + self.type = "Embeddings" + + if isinstance(embedding_model, Word2VecKeyedVectors): + self.embedding_model = embedding_model + elif isinstance(embedding_model, str): + import gensim.downloader as api + self.embedding_model = api.load(embedding_model) + else: + raise ValueError("Please select a correct Gensim model: \n" + "`import gensim.downloader as api` \n" + "`ft = api.load('fasttext-wiki-news-subwords-300')`") + + self.min_similarity = min_similarity + self.top_n = top_n + self.cosine_method = cosine_method + + self.embeddings_to = None + + def match(self, + from_list: List[str], + to_list: List[str] = None, + embeddings_from: np.ndarray = None, + embeddings_to: np.ndarray = None, + re_train: bool = True) -> pd.DataFrame: + """ Matches the two lists of strings to each other and returns the best mapping + + Arguments: + from_list: The list from which you want mappings + to_list: The list where you want to map to + embeddings_from: Embeddings you created yourself from the `from_list` + embeddings_to: Embeddings you created 
yourself from the `to_list` + re_train: Whether to re-train the model with new embeddings + Set this to False if you want to use this model in production + + Returns: + matches: The best matches between the lists of strings + + Usage: + + ```python + model = Embeddings(min_similarity=0.5) + matches = model.match(["string_one", "string_two"], + ["string_three", "string_four"]) + ``` + """ + # Extract embeddings from the `from_list` + if not isinstance(embeddings_from, np.ndarray): + embeddings_from = self._embed(from_list) + + # Extract embeddings from the `to_list` if it exists + if not isinstance(embeddings_to, np.ndarray): + if not re_train: + embeddings_to = self.embeddings_to + elif to_list is None: + embeddings_to = self._embed(from_list) + else: + embeddings_to = self._embed(to_list) + + matches = cosine_similarity(embeddings_from, embeddings_to, + from_list, to_list, + self.min_similarity, + top_n=self.top_n, + method=self.cosine_method) + + self.embeddings_to = embeddings_to + + return matches + + def _embed(self, strings: List[str]) -> np.ndarray: + """ Create embeddings from a list of strings """ + vector_shape = self.embedding_model.get_vector(list(self.embedding_model.index_to_key)[0]).shape[0] + empty_vector = np.zeros(vector_shape) + + embeddings = [] + for doc in strings: + doc_embedding = [] + + # Extract word embeddings + for word in doc.split(" "): + try: + word_embedding = self.embedding_model.get_vector(word) + doc_embedding.append(word_embedding) + except KeyError: + doc_embedding.append(empty_vector) + + # Pool word embeddings + doc_embedding = np.mean(doc_embedding, axis=0) + embeddings.append(doc_embedding) + + embeddings = np.array(embeddings) + return embeddings diff --git a/polyfuzz/models/_rapidfuzz.py b/polyfuzz/models/_rapidfuzz.py index 32dc784..7aefed4 100644 --- a/polyfuzz/models/_rapidfuzz.py +++ b/polyfuzz/models/_rapidfuzz.py @@ -60,7 +60,8 @@ def __init__(self, def match(self, from_list: List[str], - to_list: List[str] = None) 
-> pd.DataFrame: + to_list: List[str] = None, + **kwargs) -> pd.DataFrame: """ Calculate the edit distances between two list of strings by parallelizing the calculation and passing the lists in batches. diff --git a/polyfuzz/models/_sbert.py b/polyfuzz/models/_sbert.py new file mode 100644 index 0000000..a6e7f7b --- /dev/null +++ b/polyfuzz/models/_sbert.py @@ -0,0 +1,114 @@ +import numpy as np +import pandas as pd +from typing import List, Union +from sentence_transformers import SentenceTransformer + +from ._utils import cosine_similarity +from ._base import BaseMatcher + + +class SentenceEmbeddings(BaseMatcher): + """ + Embed words into vectors and use cosine similarity to find + the best matches between two lists of strings + + Arguments: + embedding_model: The sbert model to use, this can be either a string or the model directly + min_similarity: The minimum similarity between strings, otherwise return 0 similarity + top_n: The number of best matches you want returned + cosine_method: The method/package for calculating the cosine similarity. + Options: "sparse", "sklearn", "knn". + Sparse is the fastest and most memory efficient but requires a + package that might be difficult to install. 
def match(self,
          from_list: List[str],
          to_list: List[str] = None,
          embeddings_from: np.ndarray = None,
          embeddings_to: np.ndarray = None,
          re_train: bool = True) -> pd.DataFrame:
    """ Matches the two lists of strings to each other and returns the best mapping

    Arguments:
        from_list: The list from which you want mappings
        to_list: The list where you want to map to
        embeddings_from: Embeddings you created yourself from the `from_list`.
                         When a numpy array is given it is used as-is and
                         `from_list` is not encoded again
        embeddings_to: Embeddings you created yourself from the `to_list`
        re_train: Whether to re-train the model with new embeddings
                  Set this to False if you want to use this model in production

    Returns:
        matches: The best matches between the lists of strings

    Usage:

    ```python
    model = SentenceEmbeddings("all-MiniLM-L6-v2", min_similarity=0.5)
    matches = model.match(["string_one", "string_two"],
                          ["string_three", "string_four"])
    ```
    """
    # Extract embeddings from the `from_list`, unless the caller supplied them
    if not isinstance(embeddings_from, np.ndarray):
        embeddings_from = self.embedding_model.encode(from_list, show_progress_bar=False)

    # Extract embeddings from the `to_list` if it exists
    if not isinstance(embeddings_to, np.ndarray):
        if not re_train:
            # Production mode: reuse the embeddings stored by a previous call
            embeddings_to = self.embeddings_to
        elif to_list is None:
            embeddings_to = self.embedding_model.encode(from_list, show_progress_bar=False)
        else:
            embeddings_to = self.embedding_model.encode(to_list, show_progress_bar=False)

    matches = cosine_similarity(embeddings_from, embeddings_to,
                                from_list, to_list,
                                self.min_similarity,
                                top_n=self.top_n,
                                method=self.cosine_method)

    # Remember the target embeddings so later calls can pass `re_train=False`
    self.embeddings_to = embeddings_to

    return matches
def __init__(self,
             embedding_model = "en_core_web_md",
             min_similarity: float = 0.75,
             top_n: int = 1,
             cosine_method: str = "sparse",
             model_id: str = None):
    """ Set up the matcher around a Spacy pipeline

    A string is loaded through `spacy.load` with the tagger, parser, ner,
    attribute_ruler and lemmatizer components excluded. Any other object is
    accepted as-is when the name of its type contains "spacy"; everything
    else raises a `ValueError`.
    """
    super().__init__(model_id)
    self.type = "Embeddings"

    given_as_name = isinstance(embedding_model, str)

    # Reject anything that is neither a model name nor a spacy-typed object
    if not given_as_name and "spacy" not in str(type(embedding_model)):
        raise ValueError("Please select a correct Spacy model by either using a string such as 'en_core_web_md' "
                         "or create a nlp model using: `nlp = spacy.load('en_core_web_md')")

    if given_as_name:
        embedding_model = spacy.load(embedding_model,
                                     exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
    self.embedding_model = embedding_model

    self.min_similarity = min_similarity
    self.top_n = top_n
    self.cosine_method = cosine_method

    # Filled by `match` so that a later call can reuse the target embeddings
    self.embeddings_to = None
def _embed(self, strings: List[str]) -> np.ndarray:
    """ Create embeddings from a list of strings

    Pipelines that contain a "transformer" component are read through
    `doc._.trf_data.tensors[-1][0]`; all other pipelines use `doc.vector`.
    A string that cannot be embedded is replaced by the embedding of the
    placeholder text "An empty document".

    Arguments:
        strings: The strings to embed

    Returns:
        embeddings: An array with one embedding per input string
    """
    if "transformer" in self.embedding_model.component_names:
        # Extract embeddings from a transformer model
        # NOTE(review): presumably `tensors[-1][0]` is the pooled output - confirm
        def vectorize(text):
            return self.embedding_model(text)._.trf_data.tensors[-1][0].tolist()

        # The original caught everything here with a bare `except:`; `Exception`
        # keeps that best-effort fallback without swallowing KeyboardInterrupt/SystemExit
        recoverable = Exception
    else:
        # Extract embeddings from a general spacy model
        def vectorize(text):
            return self.embedding_model(text).vector

        recoverable = ValueError

    embeddings = []
    for doc in strings:
        try:
            embedding = vectorize(doc)
        except recoverable:
            embedding = vectorize("An empty document")
        embeddings.append(embedding)

    return np.array(embeddings)
def _extract_tf_idf(self,
                    from_list: List[str],
                    to_list: List[str] = None,
                    re_train: bool = True) -> Tuple[np.ndarray, np.ndarray]:
    """ Calculate the TF-IDF matrices of `from_list` and `to_list`

    With `re_train=True` a new vectorizer is fitted (on `to_list + from_list`,
    or on `from_list` alone when no `to_list` is given) and stored together
    with the matrix of the strings that are mapped to. With `re_train=False`
    the stored vectorizer and target matrix are reused and only `from_list`
    is vectorized.

    Arguments:
        from_list: The list from which you want mappings
        to_list: The list where you want to map to. An empty list is treated
                 the same as `None`
        re_train: Whether to fit a new vectorizer or reuse the stored one

    Returns:
        tf_idf_from: The TF-IDF matrix of `from_list`
        tf_idf_to: The TF-IDF matrix of the strings that are mapped to

    Raises:
        ValueError: If `re_train` is False while nothing has been fitted yet
    """
    if re_train:
        corpus = to_list + from_list if to_list else from_list
        targets = to_list if to_list else from_list

        self.vectorizer = TfidfVectorizer(min_df=1, analyzer=self._create_ngrams).fit(corpus)
        self.tf_idf_to = self.vectorizer.transform(targets)

        # Matching a list against itself: both sides share the same matrix
        if not to_list:
            return self.tf_idf_to, self.tf_idf_to

    elif getattr(self, "vectorizer", None) is None or getattr(self, "tf_idf_to", None) is None:
        raise ValueError("There is no fitted TF-IDF representation to reuse. "
                         "Call `match` with `re_train=True` first.")

    # Always vectorize `from_list` itself. Previously, `re_train=False` without a
    # `to_list` returned the stored target matrix in place of the `from_list` matrix
    tf_idf_from = self.vectorizer.transform(from_list)

    return tf_idf_from, self.tf_idf_to
def __init__(self,
             embedding_model = "https://tfhub.dev/google/universal-sentence-encoder/4",
             min_similarity: float = 0.75,
             top_n: int = 1,
             cosine_method: str = "sparse",
             model_id: str = None):
    """ Set up the matcher around a Universal Sentence Encoder model

    Arguments:
        embedding_model: Either a string that is handed to `tensorflow_hub.load`
                         or an already loaded, callable model
        min_similarity: The minimum similarity between strings, otherwise return 0 similarity
        top_n: The number of best matches you want returned
        cosine_method: The method/package for calculating the cosine similarity
        model_id: The name of the particular instance, used when comparing models

    Raises:
        ValueError: If `embedding_model` is not a string and calling it with a
                    list containing one sentence raises a `TypeError`
    """
    super().__init__(model_id)
    self.type = "Embeddings"

    if isinstance(embedding_model, str):
        # Store the loaded model on the instance. Previously the result was only
        # bound to the local name, leaving `self.embedding_model` undefined
        self.embedding_model = tensorflow_hub.load(embedding_model)
    else:
        # Probe the object once to make sure it can be called on a list of sentences
        try:
            embedding_model(["test sentence"])
        except TypeError as error:
            raise ValueError("Please select a correct USE model: \n"
                             "`import tensorflow_hub` \n"
                             "`embedding_model = tensorflow_hub.load(path_to_model)`") from error
        self.embedding_model = embedding_model

    self.min_similarity = min_similarity
    self.top_n = top_n
    self.cosine_method = cosine_method

    # Filled by `match` so that a later call can reuse the target embeddings
    self.embeddings_to = None
def _embed(self, strings: List[str]) -> np.ndarray:
    """ Create embeddings from a list of strings

    The model is called once per string with a single-item list; the first
    row of each result is collected into the returned array.
    """
    rows = []
    for text in strings:
        encoded = self.embedding_model([text])
        rows.append(encoded.cpu().numpy()[0])
    return np.array(rows)
TFIDF(min_similarity=0, top_n=top_n) + self.matches = {"TF-IDF": self.method.match(from_list, to_list)} elif self.method in ["EditDistance", "Edit Distance"]: - self.matches = {"EditDistance": RapidFuzz().match(from_list, to_list)} + self.method = RapidFuzz() + self.matches = {"EditDistance": self.method.match(from_list, to_list)} elif self.method in ["Embeddings", "Embedding"]: - self.matches = {"Embeddings": Embeddings(min_similarity=0, top_n=top_n).match(from_list, to_list)} + self.method = Embeddings(min_similarity=0, top_n=top_n) + self.matches = {"Embeddings": self.method.match(from_list, to_list)} else: raise ValueError("Please instantiate the model with one of the following methods: \n" "* 'TF-IDF'\n" @@ -149,6 +153,136 @@ def match(self, return self + def fit(self, + from_list: List[str], + to_list: List[str] = None): + """ Fit one or model distance models on `from_list` if no `to_list` is given + or fit them on `to_list` if both `from_list` and `to_list` are given. + + Typically, the `to_list` will be tracked as the list that we want to transform + our `from_list` to. In other words, it is the golden list of words that we + want the words in the `from_list` mapped to. + + However, you can also choose a single `from_list` and leave `to_list` empty + to map all words from within `from_list` to each other. Then, `from_list` + will be tracked instead as the golden list of words. + + Thus, if you want to train on a single list instead, use only `from_list` + and keep `to_list` empty. + + Arguments: + from_list: The list from which you want mappings. + If you want to map items within a list, and not map the + items to themselves, you can supply only the `from_list` and + ignore the `to_list`. 
def transform(self, from_list: List[str]) -> Mapping[str, pd.DataFrame]:
    """ After fitting your model, match all words in `from_list`
    to the words that were fitted on previously.

    Every model is called with `re_train=False`, so whatever it stored
    while fitting is reused instead of being recomputed.

    Arguments:
        from_list: The list from which you want mappings.

    Returns:
        all_matches: The matches of each model, keyed by the `type` attribute
                     of that model. Models that share the same `type`
                     overwrite each other's entry.

    Raises:
        ValueError: If no fitted `to_list` is available yet

    Usage:

    After having initialized your models, you can pass through lists of strings:

    ```python
    import polyfuzz as pf
    model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
    model.fit(["input_string_1", "input_string2"])
    ```

    Then, you can transform and normalize new strings:

    ```python
    results = model.transform(["input_string_1", "input_string2"])
    ```
    """
    # Fail early with a clear message instead of an AttributeError further down
    to_list = getattr(self, "to_list", None)
    if to_list is None:
        raise ValueError("This PolyFuzz instance is not fitted yet. "
                         "Call `fit` or `fit_transform` before using `transform`.")

    all_matches = {}

    if isinstance(self.method, BaseMatcher):
        all_matches[self.method.type] = self.method.match(from_list, to_list, re_train=False)

    elif isinstance(self.method, Iterable):
        for model in self.method:
            all_matches[model.type] = model.match(from_list, to_list, re_train=False)

    return all_matches
def save(self, path: str) -> None:
    """ Saves the model to the specified path

    The complete instance is written to the file with `joblib.dump`.

    Arguments:
        path: the location and name of the file you want to save

    Usage:
    ```python
    model.save("my_model")
    ```
    """
    with open(path, 'wb') as target:
        joblib.dump(self, target)
open(path, 'rb') as file: + model = joblib.load(file) + return model + def _create_groups(self, name: str, model: BaseMatcher, diff --git a/setup.py b/setup.py index 8af809d..e960848 100644 --- a/setup.py +++ b/setup.py @@ -24,10 +24,35 @@ "scikit_learn>= 0.22.2.post1" ] -fast_cosine = ["sparse_dot_topn>=0.2.9"] -embeddings_packages = ["torch>=1.4.0,<1.7.1", "flair>= 0.7"] +gensim_packages = [ + "gensim>=4.0.0" +] + +sbert_packages = [ + "sentence-transformers>=0.4.1" +] + +fast_cosine = [ + "sparse_dot_topn>=0.2.9" +] + +embeddings_packages = [ + "torch>=1.4.0,<1.7.1", + "flair>= 0.7" +] + +spacy_packages = [ + "spacy>=3.0.1" +] + +use_packages = [ + "tensorflow", + "tensorflow_hub", + "tensorflow_text" +] + -extra_packages = embeddings_packages + fast_cosine +extra_packages = embeddings_packages + fast_cosine + sbert_packages + spacy_packages + use_packages dev_packages = docs_packages + test_packages + extra_packages @@ -37,7 +62,7 @@ setup( name="polyfuzz", packages=find_packages(exclude=["notebooks", "docs"]), - version="0.3.4", + version="0.4.0", author="Maarten Grootendorst", author_email="maartengrootendorst@gmail.com", description="PolyFuzz performs fuzzy string matching, grouping, and evaluation.", @@ -72,7 +97,9 @@ "dev": dev_packages, "flair": embeddings_packages, "fast": fast_cosine, - "all": extra_packages + "sbert": sbert_packages, + "use": use_packages, + "gensim": gensim_packages, }, python_requires='>=3.6', ) diff --git a/tests/test_polyfuzz.py b/tests/test_polyfuzz.py index 15fa920..1cec4a9 100644 --- a/tests/test_polyfuzz.py +++ b/tests/test_polyfuzz.py @@ -4,28 +4,40 @@ from rapidfuzz import fuzz from polyfuzz import PolyFuzz -from polyfuzz.models import EditDistance, TFIDF, RapidFuzz, BaseMatcher +from polyfuzz.models import EditDistance, TFIDF, RapidFuzz, BaseMatcher, SentenceEmbeddings from tests.utils import get_test_strings from_list, to_list = get_test_strings() +sentence_model = SentenceEmbeddings("all-MiniLM-L6-v2") + + +import numpy as 
class MyModel(BaseMatcher):
    """ Minimal custom matcher used by the tests

    Scores every `from_list` string against every `to_list` string with
    `fuzz.ratio` and keeps the best-scoring target per source string.
    """

    def match(self, from_list, to_list, **kwargs):
        # Pairwise similarities, one row per source string, scaled by 1/100
        similarity = np.array([[fuzz.ratio(source, target) / 100 for target in to_list]
                               for source in from_list])

        # Best target per source string
        best_index = similarity.argmax(axis=1)
        best_targets = [to_list[index] for index in best_index]

        return pd.DataFrame({'From': from_list,
                             'To': best_targets,
                             'Similarity': similarity.max(axis=1)})
MyModel()]) def test_grouper(method): model = PolyFuzz(method).match(from_list, to_list) model.group(link_min_similarity=0.75)