Add tutorial datasets (#4)

MaartenGr · Nov 28, 2020 · b01e426 · b01e426
1 parent 1cd46f4
commit b01e426
Show file tree

Hide file tree

Showing 6 changed files with 92 additions and 2 deletions.
diff --git a/docs/tutorial/datasets/datasets.md b/docs/tutorial/datasets/datasets.md
@@ -0,0 +1,43 @@
+# Datasets
+There are two datasets prepared for you to play around with:
+* Company Names
+* Movie Titles
+
+## Movie Titles
+This data is retrieved from:  
+* https://www.kaggle.com/stefanoleone992/imdb-extensive-dataset  
+* https://www.kaggle.com/shivamb/netflix-shows  
+
+It contains Netflix and IMDB movie titles that can be matched against each other: 
+IMDB has 80852 movie titles and Netflix has 6172 movie titles.
+
+You can use them as follows:
+
+```python
+from polyfuzz import PolyFuzz
+from polyfuzz.datasets import load_movie_titles
+
+data = load_movie_titles()
+model = PolyFuzz("TF-IDF").match(data["Netflix"], data["IMDB"])
+```
+
+## Company Names
+This data is retrieved from https://www.kaggle.com/dattapiy/sec-edgar-companies-list?select=sec__edgar_company_info.csv 
+and contains 100_000 company names to be matched against each other. 
+
+This is a different use case than what you have typically seen so far. We often see two different lists compared 
+with each other. Here, you can use this dataset to compare the company names with themselves in order to clean 
+them up. 
+
+You can use them as follows:
+
+```python
+from polyfuzz import PolyFuzz
+from polyfuzz.datasets import load_company_names
+
+data = load_company_names()
+model = PolyFuzz("TF-IDF").match(data, data)
+```
+
+PolyFuzz will recognize that the lists are similar and that you are looking to match the company names with themselves. 
+It will ignore any comparison a string has with itself; otherwise, every name would simply get mapped to itself. 
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -12,6 +12,7 @@ nav:
     - Models: tutorial/models/models.md
     - Custom Models: tutorial/basematcher/basematcher.md
     - Custom Grouper: tutorial/grouper/grouper.md
+    - Datasets: tutorial/datasets/datasets.md
   - API:
     - PolyFuzz: api/polyfuzz.md
     - Linkage: api/linkage.md

diff --git a/polyfuzz/__init__.py b/polyfuzz/__init__.py
@@ -1,2 +1,2 @@
 from .polyfuzz import PolyFuzz
-__version__ = "0.2.0"
+__version__ = "0.2.1"
diff --git a/polyfuzz/datasets/__init__.py b/polyfuzz/datasets/__init__.py
@@ -0,0 +1,6 @@
+from ._load_data import load_movie_titles, load_company_names
+
+# Public API of the datasets package: the two loaders imported above from ._load_data
+__all__ = [
+    "load_movie_titles",
+    "load_company_names"
+]
diff --git a/polyfuzz/datasets/_load_data.py b/polyfuzz/datasets/_load_data.py
@@ -0,0 +1,40 @@
+import json
+import requests
+from typing import List, Mapping
+
+
+def load_movie_titles() -> Mapping[str, List[str]]:
+    """ Load Netflix and IMDB movie titles to be matched against each other
+
+    Retrieved from:
+        https://www.kaggle.com/stefanoleone992/imdb-extensive-dataset
+        https://www.kaggle.com/shivamb/netflix-shows
+
+    Preprocessed such that it only contains the title names:
+    IMDB has 80852 titles and Netflix has 6172.
+
+    The data is downloaded over HTTP on every call, so network access
+    is required.
+
+    Returns:
+         data: a dictionary with two keys: "Netflix" and "IMDB" where
+               each value contains a list of movie titles
+
+    Raises:
+        requests.exceptions.RequestException: if the download fails,
+            times out, or the server answers with an error status
+    """
+    url = 'https://github.com/MaartenGr/PolyFuzz/raw/master/data/movie_titles.json'
+    # Without a timeout a stalled connection would block the caller forever
+    resp = requests.get(url, timeout=30)
+    # Surface HTTP errors directly instead of failing later while
+    # trying to parse an error page as JSON
+    resp.raise_for_status()
+    data = json.loads(resp.text)
+    return data
+
+
+def load_company_names() -> List[str]:
+    """ Load company names to be matched against each other.
+
+    Retrieved from:
+        https://www.kaggle.com/dattapiy/sec-edgar-companies-list?select=sec__edgar_company_info.csv
+
+    Preprocessed such that it only contains 100_000 company names.
+
+    The data is downloaded over HTTP on every call, so network access
+    is required.
+
+    Returns:
+        data: a list of company names
+
+    Raises:
+        requests.exceptions.RequestException: if the download fails,
+            times out, or the server answers with an error status
+    """
+    url = 'https://github.com/MaartenGr/PolyFuzz/raw/master/data/company_names.json'
+    # Without a timeout a stalled connection would block the caller forever
+    resp = requests.get(url, timeout=30)
+    # Surface HTTP errors directly instead of failing later while
+    # trying to parse an error page as JSON
+    resp.raise_for_status()
+    data = json.loads(resp.text)
+    return data
diff --git a/setup.py b/setup.py
@@ -37,7 +37,7 @@
 setup(
     name="polyfuzz",
     packages=find_packages(exclude=["notebooks", "docs"]),
-    version="0.2.0",
+    version="0.2.1",
     author="Maarten Grootendorst",
     author_email="[email protected]",
     description="PolyFuzz performs fuzzy string matching, grouping, and evaluation.",