From b01e4269061711d2f3e0dd0932c472ebfa6bc608 Mon Sep 17 00:00:00 2001 From: Maarten Grootendorst Date: Sat, 28 Nov 2020 07:46:30 +0100 Subject: [PATCH] Add tutorial datasets (#4) --- docs/tutorial/datasets/datasets.md | 43 ++++++++++++++++++++++++++++++ mkdocs.yml | 1 + polyfuzz/__init__.py | 2 +- polyfuzz/datasets/__init__.py | 6 +++++ polyfuzz/datasets/_load_data.py | 40 +++++++++++++++++++++++++++ setup.py | 2 +- 6 files changed, 92 insertions(+), 2 deletions(-) create mode 100644 docs/tutorial/datasets/datasets.md create mode 100644 polyfuzz/datasets/__init__.py create mode 100644 polyfuzz/datasets/_load_data.py diff --git a/docs/tutorial/datasets/datasets.md b/docs/tutorial/datasets/datasets.md new file mode 100644 index 0000000..7ea228b --- /dev/null +++ b/docs/tutorial/datasets/datasets.md @@ -0,0 +1,43 @@ +# Datasets +There are two datasets prepared for you to play around with: +* Company Names +* Movie Titles + +## Movie Titles +This data is retrieved from: +* https://www.kaggle.com/stefanoleone992/imdb-extensive-dataset +* https://www.kaggle.com/shivamb/netflix-shows + +It contains Netflix and IMDB movie titles that can be matched against each other. +IMDB has 80852 movie titles and Netflix has 6172 movie titles. + +You can use them as follows: + +```python +from polyfuzz import PolyFuzz +from polyfuzz.datasets import load_movie_titles + +data = load_movie_titles() +model = PolyFuzz("TF-IDF").match(data["Netflix"], data["IMDB"]) +``` + +## Company Names +This data is retrieved from https://www.kaggle.com/dattapiy/sec-edgar-companies-list?select=sec__edgar_company_info.csv +and contains 100,000 company names to be matched against each other. + +This is a different use case than what you have typically seen so far. We often see two different lists compared +with each other. Here, you can use this dataset to compare the company names with themselves in order to clean +them up. 
+ +You can use them as follows: + +```python +from polyfuzz import PolyFuzz +from polyfuzz.datasets import load_company_names + +data = load_company_names() +model = PolyFuzz("TF-IDF").match(data, data) +``` + +PolyFuzz will recognize that the lists are similar and that you are looking to match the company names with themselves. +It will ignore any comparison a string has with itself; otherwise, everything would get mapped to itself. diff --git a/mkdocs.yml b/mkdocs.yml index af7473c..74d4ab1 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -12,6 +12,7 @@ nav: - Models: tutorial/models/models.md - Custom Models: tutorial/basematcher/basematcher.md - Custom Grouper: tutorial/grouper/grouper.md + - Datasets: tutorial/datasets/datasets.md - API: - PolyFuzz: api/polyfuzz.md - Linkage: api/linkage.md diff --git a/polyfuzz/__init__.py b/polyfuzz/__init__.py index 4629de2..777bdd9 100644 --- a/polyfuzz/__init__.py +++ b/polyfuzz/__init__.py @@ -1,2 +1,2 @@ from .polyfuzz import PolyFuzz -__version__ = "0.2.0" +__version__ = "0.2.1" diff --git a/polyfuzz/datasets/__init__.py b/polyfuzz/datasets/__init__.py new file mode 100644 index 0000000..a0d343f --- /dev/null +++ b/polyfuzz/datasets/__init__.py @@ -0,0 +1,6 @@ +from ._load_data import load_movie_titles, load_company_names + +__all__ = [ + "load_movie_titles", + "load_company_names" +] \ No newline at end of file diff --git a/polyfuzz/datasets/_load_data.py b/polyfuzz/datasets/_load_data.py new file mode 100644 index 0000000..9c6f560 --- /dev/null +++ b/polyfuzz/datasets/_load_data.py @@ -0,0 +1,40 @@ +import json +import requests +from typing import List, Mapping + + +def load_movie_titles() -> Mapping[str, List[str]]: + """ Load Netflix and IMDB movie titles to be matched against each other + + Retrieved from: + https://www.kaggle.com/stefanoleone992/imdb-extensive-dataset + https://www.kaggle.com/shivamb/netflix-shows + + Preprocessed such that it only contains the title names where + IMDB has 80852 titles and Netflix has 6172 + + 
Returns: + data: a dictionary with two keys: "Netflix" and "IMDB" where + each value contains a list of movie titles + """ + url = 'https://github.com/MaartenGr/PolyFuzz/raw/master/data/movie_titles.json' + resp = requests.get(url) + data = json.loads(resp.text) + return data + + +def load_company_names() -> List[str]: + """ Load company names to be matched against each other. + + Retrieved from: + https://www.kaggle.com/dattapiy/sec-edgar-companies-list?select=sec__edgar_company_info.csv + + Preprocessed such that it only contains 100_000 company names. + + Returns: + data: a list of company names + """ + url = 'https://github.com/MaartenGr/PolyFuzz/raw/master/data/company_names.json' + resp = requests.get(url) + data = json.loads(resp.text) + return data diff --git a/setup.py b/setup.py index 3081755..93cb72f 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ setup( name="polyfuzz", packages=find_packages(exclude=["notebooks", "docs"]), - version="0.2.0", + version="0.2.1", author="Maarten Grootendorst", author_email="maartengrootendorst@gmail.com", description="PolyFuzz performs fuzzy string matching, grouping, and evaluation.",