psolin · petri · Apr 18, 2020 · Apr 19, 2020 · Apr 19, 2020 · Apr 19, 2020
diff --git a/.gitignore b/.gitignore
@@ -55,3 +55,7 @@ docs/_build/
 
 # PyBuilder
 target/
+
+# MacOS
+
+.DS_Store
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -1,6 +1,15 @@
 Changelog
 **********
 
+2.0 (2020-04-18)
+----------------
+
+- Major refactoring & cleanup (e.g. #16)
+- Optimizations
+- new APIs
+- Python3 only (#46)
+- Better Unicode matching (#45)
+
 1.3 (9.9. 2015)
 ----------------
 

diff --git a/README.md b/README.md
@@ -17,36 +17,39 @@ countries.
 ## How do I install it?
 Just use 'pip install cleanco' if you have pip installed (as most systems do). Or download the zip distribution from this site, unzip it and then:
 
-* Mac: `cd` into it, and enter `sudo python setup.py install` along with your system password.
-* Windows: Same thing but without `sudo`.
+* Mac: `cd` into it, and enter `sudo python3 setup.py install` along with your system password.
+* Windows: `python setup.py install`.
 
 ## How does it work?
-Let's look at some sample code.  First, create an instance of the module:
+If you only want a clean version of the company name, first pull in the terms:
 
-    >>> from cleanco import cleanco
+    >>> terms = get_terms()
 
-Prepare a string of a company name that you want to process:
+Then, run the string and the terms through the "basename" function:
 
-    >>> business_name = "Some Big Pharma, LLC"
+    >>> basename("Daddy & Sons, Ltd.", terms)
+    Daddy & Sons
 
-Throw it into the instance:
+If you want to classify the name by business entity type, first select it as a source:
 
-    >>> x = cleanco(business_name)
+    >>> classification_sources = typesources()
 
-You can now get the company types:
+Then, run the string and classification source through the "matches" function:
 
-    >>> x.type()
-    ['Limited Liability Company']
+    >>> matches("MyCompany Ltd", classification_sources)
+    ['Limited']
 
-...the possible countries...
+If you want to classify the name by possible countries, first select it as a source:
 
-    >>> x.country()
-    ['United States of America', 'Philippines']
+    >>> classification_sources = countrysources()
+
+Then, run the string and classification source through the "matches" function:
 
-...and a clean version of the company name.
+    >>> matches("MyCompany Ltd", classification_sources)
+    ['United States of America', 'Philippines']
 
-    >>> x.clean_name()
-    'Some Big Pharma'
+## Compatibility with previous versions
+cleanco's API was simplified in version 2.0. While previous functions are still compatible, they are not preferred.
 
 ## Are there bugs?
 See the issue tracker. If you find a bug or have enhancement suggestion or question, please file an issue and provide a PR if you can. For example, some of the company suffixes may be incorrect or there may be suffixes missing.
@@ -55,5 +58,5 @@ To run tests, simply install the package and run `python setup.py test`. To run
 
 ## Special thanks to:
 
-- Wikipedia's [Types of Business Entity article](http://en.wikipedia.org/wiki/Types_of_business_entity), where I spent hours of research.
+- Wikipedia's [Types of Business Entity article](http://en.wikipedia.org/wiki/Types_of_business_entity).
 - Contributors: Petri Savolainen <[email protected]>
diff --git a/cleanco.py b/cleanco.py
diff --git a/cleanco/__init__.py b/cleanco/__init__.py
@@ -0,0 +1 @@
+from .cleanco import cleanco
diff --git a/cleanco/classify.py b/cleanco/classify.py
@@ -0,0 +1,60 @@
+"""
+Functions to help classify business names by country or type, based on legal terms.
+
+Examples of use:
+
+>> # check name for its possible business type(s)
+>> classification_sources = typesources()
+>> matches("MyCompany Ltd", classification_sources)
+['Limited']
+>>
+
+>> # check name for its possible jurisdictions, usually countries
+>> classification_sources = countrysources()
+>> matches("MyCompany Ltd", classification_sources)
+['New Zealand', 'United Kingdom', 'United States of America']
+>>
+
+"""
+
+from termdata import terms_by_country, terms_by_type
+from clean import strip_tail, normalized
+
+
+def typesources():
+    """Return (business type, term) pairs, longest term first.
+
+    Every term listed in ``terms_by_type`` is paired with the business type
+    it belongs to; the pairs are ordered by descending term length.
+    """
+    pairs = [
+        (business_type, term)
+        for business_type, type_terms in terms_by_type.items()
+        for term in type_terms
+    ]
+    # list.sort is stable, so equally long terms keep their definition order
+    pairs.sort(key=lambda pair: len(pair[1]), reverse=True)
+    return pairs
+
+def countrysources():
+    """Return (country, term) pairs, longest term first.
+
+    Every term listed in ``terms_by_country`` is paired with the country it
+    belongs to; the pairs are ordered by descending term length.
+    """
+    pairs = [
+        (country, term)
+        for country, country_terms in terms_by_country.items()
+        for term in country_terms
+    ]
+    # list.sort is stable, so equally long terms keep their definition order
+    pairs.sort(key=lambda pair: len(pair[1]), reverse=True)
+    return pairs
+
+def matches(name, sources):
+    """Return the classifiers whose legal term occurs as a word in *name*.
+
+    ``sources`` is an iterable of ``(classifier, term)`` pairs, such as the
+    result of ``typesources()`` or ``countrysources()``. The name is stripped
+    of its trailing punctuation and split on whitespace; each word and each
+    term is normalized before comparison. A classifier is reported once per
+    matching term, in the order of ``sources``.
+
+    NOTE(review): words are compared one at a time, so a term that contains
+    whitespace cannot normally match -- confirm whether that is intended.
+    """
+    # a set gives constant-time membership tests for every source term
+    words = {normalized(part) for part in strip_tail(name).split()}
+    return [
+        classifier
+        for classifier, term in sources
+        if normalized(term) in words
+    ]
+
diff --git a/cleanco/clean.py b/cleanco/clean.py
@@ -0,0 +1,75 @@
+"""Functions to help clean & normalize business names.
+
+See http://www.unicode.org/reports/tr15/#Normalization_Forms_Table for details
+on Unicode normalization and the NFKD normalization used here.
+
+Basic usage:
+
+>> terms = get_terms()
+>> basename("Daddy & Sons, Ltd.", terms)
+Daddy & Sons
+
+"""
+
+import functools
+import operator
+from collections import OrderedDict
+import re
+import unicodedata
+from termdata import terms_by_type, terms_by_country
+
+
+tail_removal_rexp = re.compile(r"[^\.\w]+$", flags=re.UNICODE)
+
+
+def get_terms():
+    """Collect every distinct term from the termdata definitions.
+
+    Returns a set holding all terms found in the values of ``terms_by_type``
+    and ``terms_by_country``.
+    """
+    unique_terms = set()
+    for term_lists in (terms_by_type.values(), terms_by_country.values()):
+        for term_list in term_lists:
+            unique_terms.update(term_list)
+    return unique_terms
+
+
+def strip_tail(name):
+    """Cut off the trailing run of characters that are neither dots nor word characters."""
+    tail = tail_removal_rexp.search(name)
+    return name if tail is None else name[: tail.start()]
+
+
+def normalized(text):
+    """Return *text* prepared for caseless, compatibility-insensitive comparison.
+
+    Follows the Unicode definition of compatibility caseless matching:
+    NFKD(casefold(NFKD(casefold(NFD(text))))). Case folding alone is not
+    enough before decomposing, because some compatibility characters only
+    turn into cased letters once decomposed (U+33CD decomposes to "KK"), so
+    folding has to be repeated after the decomposition.
+    """
+    folded = unicodedata.normalize("NFD", text).casefold()
+    folded = unicodedata.normalize("NFKD", folded).casefold()
+    return unicodedata.normalize("NFKD", folded)
+
+
+def basename(name, terms, suffix=True, prefix=False, middle=False, multi=False):
+    """Return the business name with legal terms removed.
+
+    name   -- business name to clean
+    terms  -- iterable of legal terms, e.g. the result of get_terms()
+    suffix -- remove a term found as the last word
+    prefix -- remove a term found as the first word
+    middle -- remove the first occurrence of a term anywhere in the name
+    multi  -- keep removing after the first hit instead of stopping
+
+    Words and terms are compared after normalization. An empty name, or a
+    name from which every word gets removed, yields an empty string.
+    Terms are tried in iteration order, so with an unordered collection the
+    term that is removed first may vary.
+    """
+
+    name = strip_tail(name)
+    parts = name.split()
+    nparts = [normalized(p) for p in parts]
+
+    # return name without suffixed/prefixed/middle type term(s)
+    for term in (normalized(t) for t in terms):
+        if not nparts:
+            # nothing left to inspect; indexing an empty list would raise
+            break
+        if suffix and nparts[-1] == term:
+            del nparts[-1]
+            del parts[-1]
+            if not multi:
+                break
+        # the suffix removal above may have emptied the list
+        if prefix and nparts and nparts[0] == term:
+            del nparts[0]
+            del parts[0]
+            if not multi:
+                break
+        if middle and term in nparts:
+            idx = nparts.index(term)
+            del nparts[idx]
+            del parts[idx]
+            # stop only after an actual removal, so later terms still get tried
+            if not multi:
+                break
+
+    return strip_tail(" ".join(parts))
diff --git a/cleanco/cleanco.py b/cleanco/cleanco.py
@@ -0,0 +1,20 @@
+from clean import get_terms, basename
+from classify import typesources, countrysources, matches
+
+
+class cleanco:
+    """Backwards-compatibility wrapper around the function-based API.
+
+    New code should call ``basename`` and ``matches`` directly. Two calling
+    styles are supported:
+
+    - pre-2.0 style, with the name bound at construction time:
+      ``cleanco("Some Big Pharma, LLC").clean_name()``
+    - one shared instance, with the name passed to each method:
+      ``cleanco().clean_name("Some Big Pharma, LLC")``
+    """
+
+    def __init__(self, name=None):
+        # name is optional so that one instance can serve many names
+        self._name = name
+        self._types = typesources()
+        self._countries = countrysources()
+        self._terms = get_terms()
+
+    def _resolve(self, name):
+        "pick the explicitly given name, else the one bound at construction"
+        if name is None:
+            name = self._name
+        if name is None:
+            raise TypeError("a business name is required")
+        return name
+
+    def clean_name(self, name=None):
+        "return the name without its legal terms"
+        return basename(self._resolve(name), self._terms)
+
+    def country(self, name=None):
+        "return the countries whose legal terms occur in the name"
+        return matches(self._resolve(name), self._countries)
+
+    def type(self, name=None):
+        "return the business types whose legal terms occur in the name"
+        return matches(self._resolve(name), self._types)
-Original file line number
+Diff line change
@@ Expand Up / @@ -55,3 +55,7 @@ docs/_build/ @@
     # PyBuilder
     target/
+    # MacOS
+    .DS_Store