v0.4.4

SUNCAT-Center · Aug 14, 2018 · 4b927ef · 4b927ef
1 parent cb4da89
commit 4b927ef
Show file tree

Hide file tree

Showing 6 changed files with 100 additions and 15 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,13 +1,14 @@
-# dev
+# Version 0.4.4 (August 2018)
 
+-   Major modifications to the adsorbate fingerprinter.
 -   Bag of site neighbor coordination numbers implemented.
--   Bag of bonds implemented for adsorbate systems.
--   Bag of bonds implemented.
+-   Bag of connections implemented for adsorbate systems.
+-   General bag of connections implemented.
 -   Data cleaning functions now return a dictionary with the 'index' of clean features.
+-   New clean function to discard features with excessive skewness.
 -   New adsorbate-chalcogenide fingerprint generator.
--   Improved automatic identification of adsorbate, site.
+-   Enhancements to automatic identification of adsorbate, site.
 -   Generalized coordination number for site.
--   Formal oxidation state fingerprints for chalcogenides.
 -   Formal charges utility.
 -   New sum electronegativity over bonds fingerprinter.
 

diff --git a/catlearn/preprocess/clean_data.py b/catlearn/preprocess/clean_data.py
@@ -2,6 +2,7 @@
 import numpy as np
 from collections import defaultdict
 from sklearn.preprocessing import Imputer
+from scipy.stats import skew
 
 
 def remove_outliers(features, targets, con=1.4826, dev=3., constraint=None):
@@ -176,3 +177,44 @@ def clean_infinite(train, test=None, targets=None, labels=None, mask=None,
     clean['labels'] = labels
 
     return clean
+
+
+def clean_skewness(train, test=None, labels=None, mask=None, skewness=3.):
+    """Discard features that are excessively skewed.
+
+    Parameters
+    ----------
+    train : array
+        Feature matrix for the training data.
+    test : array
+        Optional feature matrix for the test data. Default is None passed.
+    labels : array
+        Optional list of feature labels. Default is None passed.
+    mask : list
+        Indices of features that are not subject to cleaning.
+    skewness : float
+        Threshold; features with absolute skewness at or above it are dropped.
+
+    Returns
+    -------
+    clean : dict
+        'index' of the kept features and the reduced 'train', plus 'test'
+        and 'labels' when they were passed.
+
+    Raises
+    ------
+    ValueError
+        If the skewness of a feature subject to cleaning is not finite.
+    """
+    train = np.asarray(train, dtype=np.float64)
+    clean = defaultdict(list)
+
+    # Masked features are kept unconditionally, so they are left out of both
+    # the finiteness check and the threshold comparison.
+    keep = np.zeros(np.shape(train)[1], dtype=bool)
+    if mask is not None:
+        keep[np.asarray(mask, dtype=int)] = True
+    unmasked = ~keep
+
+    data_skewness = skew(train, axis=0)
+    # Raise rather than assert: asserts are stripped under `python -O`.
+    if not np.isfinite(data_skewness[unmasked]).all():
+        raise ValueError('Skewness must be finite for all unmasked features.')
+    keep[unmasked] = np.abs(data_skewness[unmasked]) < skewness
+
+    # Index of informative features.
+    index = list(np.where(keep)[0])
+    clean['index'] = index
+
+    # Clean data.
+    clean['train'] = train[:, index].copy()
+    if test is not None:
+        test = np.asarray(test, dtype=np.float64)
+        clean['test'] = test[:, index].copy()
+    if labels is not None:
+        labels = np.asarray(labels)
+        clean['labels'] = labels[index].copy()
+
+    return clean
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -1,6 +1,33 @@
 # Changelog
 
-## Version 0.4.1 (April 2018)
+# Version 0.4.4 (August 2018)
+
+-   Major modifications to the adsorbate fingerprinter.
+-   Bag of site neighbor coordination numbers implemented.
+-   Bag of connections implemented for adsorbate systems.
+-   General bag of connections implemented.
+-   Data cleaning functions now return a dictionary with the 'index' of clean features.
+-   New clean function to discard features with excessive skewness.
+-   New adsorbate-chalcogenide fingerprint generator.
+-   Enhancements to automatic identification of adsorbate, site.
+-   Generalized coordination number for site.
+-   Formal charges utility.
+-   New sum electronegativity over bonds fingerprinter.
+
+# Version 0.4.3 (May 2018)
+
+-   `ConvolutedFingerprintGenerator` added for bulk and molecules.
+-   Dropped support for Python3.4 as it appears to start causing problems.
+
+# Version 0.4.2 (May 2018)
+
+-   Genetic algorithm feature selection can parallelize over population within each generation.
+-   Default fingerprinter function sets accessible using `catlearn.fingerprint.setup.default_fingerprinters`
+-   New surrogate model utility.
+-   New utility for evaluating cutoff radii for connectivity based fingerprinting.
+-   `default_catlearn_radius` improved.
+
+# Version 0.4.1 (April 2018)
 
 -   AtoML renamed to CatLearn and moved to Github.
 -   Adsorbate fingerprinting again parallelizable.
@@ -10,7 +37,7 @@
 -   Fixed a bug that caused the negative log marginal likelihood to be attached to the gp class.
 -   Small speed improvement for initialize and updates to `GaussianProcess`.
 
-## Version 0.4.0 (April 2018)
+# Version 0.4.0 (April 2018)
 
 -   Added `autogen_info` function for list of atoms objects representing adsorbates.
     -   This can auto-generate all atomic group information and attach it to `atoms.info`.
@@ -32,14 +59,14 @@
 -   Modified uncertainty output. The user can ask for the uncertainty with and without adding noise parameter (regularization).
 -   Clean up some bits of code, fix some bugs.
 
-## Version 0.3.1 (February 2018)
+# Version 0.3.1 (February 2018)
 
 -   Added a parallel version of the greedy feature selection. **Python3 only!**
 -   Updated the k-fold cross-validation function to handle features and targets explicitly.
 -   Added some basic read/write functionality to the k-fold CV.
 -   A number of minor bugs have been fixed.
 
-## Version 0.3.0 (February 2018)
+# Version 0.3.0 (February 2018)
 
 -   Update the fingerprint generator functions so there is now a `FeatureGenerator` class that wraps round all type specific generators.
 -   Feature generation can now be performed in parallel, setting `nprocs` variable in the `FeatureGenerator` class. **Python3 only!**
@@ -50,12 +77,12 @@
 -   Added some more test and changed the way test are called/handled.
 -   A number of minor bugs have been fixed.
 
-## Version 0.2.1 (February 2018)
+# Version 0.2.1 (February 2018)
 
 -   Update functions to compile features allowing for variable length of atoms objects.
 -   Added some tutorials for hierarchy cross-validation and prediction on organic molecules.
 
-## Version 0.2.0 (January 2018)
+# Version 0.2.0 (January 2018)
 
 -   Gradients added to hyperparameter optimization.
 -   More features added to the adsorbate fingerprint generator.
@@ -67,7 +94,7 @@
 -   Added Dockerfile and appropriate documentation in the README and CONTRIBUTING guidelines.
 -   A number of minor bugs have been fixed.
 
-## Version 0.1.0 (December 2017)
+# Version 0.1.0 (December 2017)
 
 -   The first stable version of the code base!
 -   For those that used the previous development version, there are many big changes in the way the code is structured. Most scripts will need to be rewritten.

diff --git a/setup.py b/setup.py
@@ -13,7 +13,7 @@ def parse_requirements(filename):
 
 setuptools.setup(
     name="CatLearn",
-    version="0.4.4.dev5",
+    version="0.4.4",
     url="https://github.com/SUNCAT-Center/CatLearn",
 
     author="Paul C. Jennings",

diff --git a/test/test_data_clean.py b/test/test_data_clean.py
@@ -52,6 +52,19 @@ def test_inf(self):
         self.assertTrue(np.shape(finite['train']) == (50, 4))
         self.assertTrue(np.shape(finite['test']) == (100, 4))
 
+    def test_skew(self):
+        """Test discarding excessively skewed features."""
+        features = np.random.random_sample((50, 5))
+        # Column 0 is all zeros except one entry, so it is strongly skewed
+        # (about 6.9, well above the default threshold of 3) and should be
+        # the only column removed.
+        features[:, 0] = 0
+        features[0, 0] = 1
+        test = np.random.random_sample((100, 5))
+        labels = 5 * ['test_label']
+
+        symmetric = clean.clean_skewness(features, test=test, labels=labels)
+
+        # assertEqual reports both values on failure, unlike assertTrue(a == b).
+        self.assertEqual(np.shape(symmetric['train']), (50, 4))
+        self.assertEqual(np.shape(symmetric['test']), (100, 4))
+        # The labels and the index must be reduced along with the features.
+        self.assertEqual(len(symmetric['labels']), 4)
+        self.assertEqual([int(i) for i in symmetric['index']], [1, 2, 3, 4])
+
     def test_general(self):
         """Test the general cleaning/scaling function."""
         train_features, train_targets, test_features, _ = get_data()

diff --git a/tutorials/02_data_setup/particle_data_setup.ipynb b/tutorials/02_data_setup/particle_data_setup.ipynb
@@ -47,6 +47,7 @@
     "from ase.io import write\n",
     "\n",
     "from catlearn.api.ase_data_setup import get_unique, get_train\n",
+    "from catlearn.api.ase_atoms_api import images_connectivity\n",
     "from catlearn.fingerprint.setup import FeatureGenerator, default_fingerprinters\n",
     "from catlearn.utilities import DescriptorDatabase"
    ]
@@ -105,9 +106,10 @@
    "outputs": [],
    "source": [
     "testset = get_unique(atoms=all_cand, size=10, key='raw_score')\n",
-    "\n",
+    "images_connectivity(testset['atoms']);\n",
     "trainset = get_train(atoms=all_cand, size=30, taken=testset['taken'],\n",
-    "                     key='raw_score')"
+    "                     key='raw_score')\n",
+    "images_connectivity(trainset['atoms']);"
    ]
   },
   {