From b01e4269061711d2f3e0dd0932c472ebfa6bc608 Mon Sep 17 00:00:00 2001
From: Maarten Grootendorst <maarten_grootendorst@hotmail.com>
Date: Sat, 28 Nov 2020 07:46:30 +0100
Subject: [PATCH] Add tutorial datasets (#4)

---
 docs/tutorial/datasets/datasets.md | 43 ++++++++++++++++++++++++++++++
 mkdocs.yml                         |  1 +
 polyfuzz/__init__.py               |  2 +-
 polyfuzz/datasets/__init__.py      |  6 +++++
 polyfuzz/datasets/_load_data.py    | 40 +++++++++++++++++++++++++++
 setup.py                           |  2 +-
 6 files changed, 92 insertions(+), 2 deletions(-)
 create mode 100644 docs/tutorial/datasets/datasets.md
 create mode 100644 polyfuzz/datasets/__init__.py
 create mode 100644 polyfuzz/datasets/_load_data.py

diff --git a/docs/tutorial/datasets/datasets.md b/docs/tutorial/datasets/datasets.md
new file mode 100644
index 0000000..7ea228b
--- /dev/null
+++ b/docs/tutorial/datasets/datasets.md
@@ -0,0 +1,43 @@
+# Datasets
+There are two datasets prepared for you to play around with:
+* Company Names
+* Movie Titles
+
+## Movie Titles
+This data is retrieved from:  
+* https://www.kaggle.com/stefanoleone992/imdb-extensive-dataset  
+* https://www.kaggle.com/shivamb/netflix-shows  
+
+It contains Netflix and IMDB movie titles that can be matched against each other: 
+IMDB has 80852 movie titles and Netflix has 6172 movie titles.
+
+You can use them as follows:
+
+```python
+from polyfuzz import PolyFuzz
+from polyfuzz.datasets import load_movie_titles
+
+data = load_movie_titles()
+model = PolyFuzz("TF-IDF").match(data["Netflix"], data["IMDB"])
+```
+
+## Company Names
+This data is retrieved from https://www.kaggle.com/dattapiy/sec-edgar-companies-list?select=sec__edgar_company_info.csv 
+and contains 100,000 company names to be matched against each other. 
+
+This is a different use case than what you have typically seen so far. We often see two different lists compared 
+with each other. Here, you can use this dataset to compare the company names with themselves in order to clean 
+them up. 
+
+You can use them as follows:
+
+```python
+from polyfuzz import PolyFuzz
+from polyfuzz.datasets import load_company_names
+
+data = load_company_names()
+model = PolyFuzz("TF-IDF").match(data, data)
+```
+
+PolyFuzz will recognize that the lists are similar and that you are looking to match the company names with themselves. 
+It will ignore any comparison of a string with itself; otherwise, every string would simply be mapped to itself. 
diff --git a/mkdocs.yml b/mkdocs.yml
index af7473c..74d4ab1 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -12,6 +12,7 @@ nav:
     - Models: tutorial/models/models.md
     - Custom Models: tutorial/basematcher/basematcher.md
     - Custom Grouper: tutorial/grouper/grouper.md
+    - Datasets: tutorial/datasets/datasets.md
   - API:
     - PolyFuzz: api/polyfuzz.md
     - Linkage: api/linkage.md
diff --git a/polyfuzz/__init__.py b/polyfuzz/__init__.py
index 4629de2..777bdd9 100644
--- a/polyfuzz/__init__.py
+++ b/polyfuzz/__init__.py
@@ -1,2 +1,2 @@
 from .polyfuzz import PolyFuzz
-__version__ = "0.2.0"
+__version__ = "0.2.1"
diff --git a/polyfuzz/datasets/__init__.py b/polyfuzz/datasets/__init__.py
new file mode 100644
index 0000000..a0d343f
--- /dev/null
+++ b/polyfuzz/datasets/__init__.py
@@ -0,0 +1,6 @@
+from ._load_data import load_movie_titles, load_company_names
+
+__all__ = [  # public names of this package: the two loaders imported above
+    "load_movie_titles",
+    "load_company_names"
+]
\ No newline at end of file
diff --git a/polyfuzz/datasets/_load_data.py b/polyfuzz/datasets/_load_data.py
new file mode 100644
index 0000000..9c6f560
--- /dev/null
+++ b/polyfuzz/datasets/_load_data.py
@@ -0,0 +1,40 @@
+import json
+import requests
+from typing import List, Mapping
+
+
+def load_movie_titles() -> Mapping[str, List[str]]:
+    """ Load Netflix and IMDB movie titles to be matched against each other
+
+    Retrieved from:
+        https://www.kaggle.com/stefanoleone992/imdb-extensive-dataset
+        https://www.kaggle.com/shivamb/netflix-shows
+
+    Preprocessed such that it only contains the title names where
+    IMDB has 80852 titles and Netflix has 6172
+
+    Returns:
+         data: a dictionary with two keys: "Netflix" and "IMDB" where
+               each value contains a list of movie titles
+    """
+    url = 'https://github.com/MaartenGr/PolyFuzz/raw/master/data/movie_titles.json'
+    resp = requests.get(url)
+    data = json.loads(resp.text)
+    return data
+
+
+def load_company_names() -> List[str]:
+    """ Load company names to be matched against each other.
+
+    Retrieved from:
+        https://www.kaggle.com/dattapiy/sec-edgar-companies-list?select=sec__edgar_company_info.csv
+
+    Preprocessed such that it only contains 100_000 company names.
+
+    Returns:
+        data: a list of company names
+    """
+    url = 'https://github.com/MaartenGr/PolyFuzz/raw/master/data/company_names.json'
+    resp = requests.get(url)
+    data = json.loads(resp.text)
+    return data
diff --git a/setup.py b/setup.py
index 3081755..93cb72f 100644
--- a/setup.py
+++ b/setup.py
@@ -37,7 +37,7 @@
 setup(
     name="polyfuzz",
     packages=find_packages(exclude=["notebooks", "docs"]),
-    version="0.2.0",
+    version="0.2.1",
     author="Maarten Grootendorst",
     author_email="maartengrootendorst@gmail.com",
     description="PolyFuzz performs fuzzy string matching, grouping, and evaluation.",