diff --git a/CHANGELOG.md b/CHANGELOG.md index 5f67953e..462f4de5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,13 +1,14 @@ -# dev +# Version 0.4.4 (August 2018) +- Major modifications to adsorbates fingerprinter - Bag of site neighbor coordinations numbers implemented. -- Bag of bonds implemented for adsorbate systems. -- Bag of bonds implemented. +- Bag of connections implemented for adsorbate systems. +- General bag of connections implemented. - Data cleaning function now return a dictionary with 'index' of clean features. +- New clean function to discard features with excessive skewness. - New adsorbate-chalcogenide fingerprint generator. -- Improved automatic identification of adsorbate, site. +- Enhancements to automatic identification of adsorbate, site. - Generalized coordination number for site. -- Formal oxidation state fingerprints for chalcogenides. - Formal charges utility. - New sum electronegativity over bonds fingerprinter. diff --git a/catlearn/preprocess/clean_data.py b/catlearn/preprocess/clean_data.py index 17ae5d22..fae148f8 100644 --- a/catlearn/preprocess/clean_data.py +++ b/catlearn/preprocess/clean_data.py @@ -2,6 +2,7 @@ import numpy as np from collections import defaultdict from sklearn.preprocessing import Imputer +from scipy.stats import skew def remove_outliers(features, targets, con=1.4826, dev=3., constraint=None): @@ -176,3 +177,44 @@ def clean_infinite(train, test=None, targets=None, labels=None, mask=None, clean['labels'] = labels return clean + + +def clean_skewness(train, test=None, labels=None, mask=None, skewness=3.): + """Discards features that are excessively skewed. + + Parameters + ---------- + train : array + Feature matrix for the training data. + test : array + Optional feature matrix for the test data. Default is None passed. + labels : array + Optional list of feature labels. Default is None passed. + mask : list + Indices of features that are not subject to cleaning. Currently ignored by this function. 
+ skewness : float + Maximum allowed skewness threshold. + """ + train = np.asarray(train, dtype=np.float64) + + clean = defaultdict(list) + + data_skewness = skew(train, axis=0) + print(data_skewness) + assert np.isfinite(data_skewness).all() + + # Index of informative features. + index = list(np.where(abs(data_skewness) < skewness)[0]) + print(index) + clean['index'] = index + + # Clean data. + clean['train'] = train[:, index].copy() + if test is not None: + test = np.asarray(test, dtype=np.float64) + clean['test'] = test[:, index].copy() + if labels is not None: + labels = np.asarray(labels) + clean['labels'] = labels[index].copy() + + return clean diff --git a/docs/changelog.md b/docs/changelog.md index 0afc44fc..dc8f99ea 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,6 +1,33 @@ # Changelog -## Version 0.4.1 (April 2018) +# Version 0.4.4 (August 2018) + +- Major modifications to adsorbates fingerprinter +- Bag of site neighbor coordination numbers implemented. +- Bag of connections implemented for adsorbate systems. +- General bag of connections implemented. +- Data cleaning function now returns a dictionary with 'index' of clean features. +- New clean function to discard features with excessive skewness. +- New adsorbate-chalcogenide fingerprint generator. +- Enhancements to automatic identification of adsorbate, site. +- Generalized coordination number for site. +- Formal charges utility. +- New sum electronegativity over bonds fingerprinter. + +# Version 0.4.3 (May 2018) + +- `ConvolutedFingerprintGenerator` added for bulk and molecules. +- Dropped support for Python3.4 as it appears to start causing problems. + +# Version 0.4.2 (May 2018) + +- Genetic algorithm feature selection can parallelize over population within each generation. 
+- Default fingerprinter function sets accessible using `catlearn.fingerprint.setup.default_fingerprinters` +- New surrogate model utility +- New utility for evaluating cutoff radii for connectivity based fingerprinting. +- `default_catlearn_radius` improved. + +# Version 0.4.1 (April 2018) - AtoML renamed to CatLearn and moved to Github. - Adsorbate fingerprinting again parallelizable. @@ -10,7 +37,7 @@ - Fixed a bug that caused the negative log marginal likelihood to be attached to the gp class. - Small speed improvement for initialize and updates to `GaussianProcess`. -## Version 0.4.0 (April 2018) +# Version 0.4.0 (April 2018) - Added `autogen_info` function for list of atoms objects representing adsorbates. - This can auto-generate all atomic group information and attach it to `atoms.info`. @@ -32,14 +59,14 @@ - Modified uncertainty output. The user can ask for the uncertainty with and without adding noise parameter (regularization). - Clean up some bits of code, fix some bugs. -## Version 0.3.1 (February 2018) +# Version 0.3.1 (February 2018) - Added a parallel version of the greedy feature selection. **Python3 only!** - Updated the k-fold cross-validation function to handle features and targets explicitly. - Added some basic read/write functionality to the k-fold CV. - A number of minor bugs have been fixed. -## Version 0.3.0 (February 2018) +# Version 0.3.0 (February 2018) - Update the fingerprint generator functions so there is now a `FeatureGenerator` class that wraps round all type specific generators. - Feature generation can now be performed in parallel, setting `nprocs` variable in the `FeatureGenerator` class. **Python3 only!** @@ -50,12 +77,12 @@ - Added some more test and changed the way test are called/handled. - A number of minor bugs have been fixed. -## Version 0.2.1 (February 2018) +# Version 0.2.1 (February 2018) - Update functions to compile features allowing for variable length of atoms objects. 
- Added some tutorials for hierarchy cross-validation and prediction on organic molecules. -## Version 0.2.0 (January 2018) +# Version 0.2.0 (January 2018) - Gradients added to hyperparameter optimization. - More features added to the adsorbate fingerprint generator. @@ -67,7 +94,7 @@ - Added Dockerfile and appropriate documentation in the README and CONTRIBUTING guidelines. - A number of minor bugs have been fixed. -## Version 0.1.0 (December 2017) +# Version 0.1.0 (December 2017) - The first stable version of the code base! - For those that used the precious development version, there are many big changes in the way the code is structured. Most scripts will need to be rewritten. diff --git a/setup.py b/setup.py index ca9b99cc..f3a5be43 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ def parse_requirements(filename): setuptools.setup( name="CatLearn", - version="0.4.4.dev5", + version="0.4.4", url="https://github.com/SUNCAT-Center/CatLearn", author="Paul C. Jennings", diff --git a/test/test_data_clean.py b/test/test_data_clean.py index d15e5b1c..bfb0f7d2 100644 --- a/test/test_data_clean.py +++ b/test/test_data_clean.py @@ -52,6 +52,19 @@ def test_inf(self): self.assertTrue(np.shape(finite['train']) == (50, 4)) self.assertTrue(np.shape(finite['test']) == (100, 4)) + def test_skew(self): + """Test cleaning excessively skewed features.""" + features = np.random.random_sample((50, 5)) + features[:, 0] = 0 + features[0, 0] = 1 + test = np.random.random_sample((100, 5)) + labels = 5 * ['test_label'] + + symmetric = clean.clean_skewness(features, test=test, labels=labels) + + self.assertTrue(np.shape(symmetric['train']) == (50, 4)) + self.assertTrue(np.shape(symmetric['test']) == (100, 4)) + def test_general(self): """Test the general cleaning/scaling function.""" train_features, train_targets, test_features, _ = get_data() diff --git a/tutorials/02_data_setup/particle_data_setup.ipynb b/tutorials/02_data_setup/particle_data_setup.ipynb index 88e354c3..9e02bb9e 100644 
--- a/tutorials/02_data_setup/particle_data_setup.ipynb +++ b/tutorials/02_data_setup/particle_data_setup.ipynb @@ -47,6 +47,7 @@ "from ase.io import write\n", "\n", "from catlearn.api.ase_data_setup import get_unique, get_train\n", + "from catlearn.api.ase_atoms_api import images_connectivity\n", "from catlearn.fingerprint.setup import FeatureGenerator, default_fingerprinters\n", "from catlearn.utilities import DescriptorDatabase" ] @@ -105,9 +106,10 @@ "outputs": [], "source": [ "testset = get_unique(atoms=all_cand, size=10, key='raw_score')\n", - "\n", + "images_connectivity(testset['atoms']);\n", "trainset = get_train(atoms=all_cand, size=30, taken=testset['taken'],\n", - " key='raw_score')" + " key='raw_score')\n", + "images_connectivity(trainset['atoms']);" ] }, {