Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve Feature Selection #47

Merged
merged 13 commits into from
Mar 27, 2024
1 change: 1 addition & 0 deletions examples/sample-selection/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ dependencies:
- metatensor
- rascaline @ git+https://github.com/Luthaf/rascaline@ca957642f512e141c7570e987aadc05c7ac71983
- skmatter
- equisolve @ git+https://github.com/lab-cosmo/equisolve.git@c858bedef4b2799eb445e4c92535ee387224089a
229 changes: 181 additions & 48 deletions examples/sample-selection/sample-selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,14 @@

import ase.io
import chemiscope
import metatensor
import numpy as np
from equisolve.numpy import feature_selection, sample_selection
from matplotlib import pyplot as plt
from metatensor import mean_over_samples
from metatensor import sum_over_samples
from rascaline import SoapPowerSpectrum
from sklearn.decomposition import PCA
from skmatter import feature_selection, sample_selection
from skmatter import feature_selection as skfeat_selection


# %%
Expand Down Expand Up @@ -57,54 +59,151 @@
# Generate a SOAP power spectrum
calculator = SoapPowerSpectrum(**hypers)
rho2i = calculator.compute(frames)


# Makes a dense block
rho2i = rho2i.keys_to_samples(["species_center"]).keys_to_properties(
["species_neighbor_1", "species_neighbor_2"]
atom_soap = rho2i.keys_to_properties(["species_neighbor_1", "species_neighbor_2"])

atom_soap_single_block = atom_soap.keys_to_samples(keys_to_move=["species_center"])

# print(atom_soap_single_block)
# print(atom_soap_single_block.block(0)) # There is only one block now!

# Sum over atomic centers to compute structure features
struct_soap = sum_over_samples(
atom_soap_single_block, sample_names=["center", "species_center"]
)
# Averages over atomic centers to compute structure features
rho2i_structure = mean_over_samples(rho2i, sample_names=["center", "species_center"])

atom_dscrptr = rho2i.block(0).values
struct_dscrptr = rho2i_structure.block(0).values

print("atom feature descriptor shape:", atom_dscrptr.shape)
print("structure feature descriptor shape:", struct_dscrptr.shape)
print("atom feature descriptor shape:", atom_soap.block(0).values.shape)
print(
"atom feature descriptor (all in one block) shape:",
atom_soap_single_block.block(0).values.shape,
)
print("structure feature descriptor shape:", struct_soap.block(0).values.shape)


# %%
# Perform structure (i.e. sample) selection
# -----------------------------------------
# Perform atomic environment (i.e. sample) selection
# ---------------------------------------------------
#
# Using FPS and CUR algorithms implemented in scikit-matter, select a subset of
# the structures. skmatter assumes that our descriptor is represented as a 2D
# matrix, with the samples along axis 0 and features along axis 1.
# Using FPS and CUR algorithms, we can perform selection of atomic environments.
# These are implemented in equisolve, which provides a wrapper around
# scikit-matter to allow for interfacing with data stored in the metatensor
# format.
#
# For more info on the functions: `skmatter
# <https://scikit-cosmo.readthedocs.io/en/latest/selection.html>`_
# Suppose we want to select the 10 most diverse environments for each chemical
# species.
#
# First, we can use the `keys_to_properties` operation in metatensor to move the
# neighbour species indices to the properties of the TensorBlocks. The resulting
# descriptor will be a TensorMap comprised of three blocks, one for each
# chemical species, where the chemical species indices are solely present in the
# keys.


print("----Atomic environment selection-----")
# Define the number of structures to select using FPS/CUR
n_structures = 25
n_envs = 25

print(atom_soap)
print(atom_soap.block(0))

# %% Now let's perform sample selection on the atomic environments. We want to
# select 10 atomic environments for each chemical species.

# Define the number of structures *per block* to select using FPS
n_envs = 10

# FPS sample selection
struct_fps = sample_selection.FPS(n_to_select=n_structures, initialize="random").fit(
struct_dscrptr
selector_atomic_fps = sample_selection.FPS(n_to_select=n_envs, initialize="random").fit(
atom_soap
)
struct_fps_idxs = struct_fps.selected_idx_

# CUR sample selection
struct_cur = sample_selection.CUR(n_to_select=n_structures).fit(struct_dscrptr)
struct_cur_idxs = struct_cur.selected_idx_
# Print the selected envs for each block
print("atomic envs selected with FPS:\n")
for key, block in selector_atomic_fps.support.items():
print("species_center:", key, "\n(struct_idx, atom_idx)\n", block.samples.values)

selector_atomic_cur = sample_selection.CUR(n_to_select=n_envs).fit(atom_soap)
# Print the selected envs for each block
print("atomic envs selected with CUR:\n")
for key, block in selector_atomic_cur.support.items():
print("species_center:", key, "\n(struct_idx, atom_idx)\n", block.samples.values)


# %%
# Selecting from a combined pool of atomic environments
# -----------------------------------------------------
#
# One can also select from a combined pool of atomic environments and
# structures, instead of selecting an equal number of atomic environments for
# each chemical species. In this case, we can move the 'species_center' key to samples
# such that our descriptor is a TensorMap consisting of a single block. Upon
# sample selection, the most diverse atomic environments will be selected,
# regardless of their chemical species.
print("----All atomic environment selection-----")

print("keys", atom_soap.keys)
print("blocks", atom_soap[0])
print("samples in first block", atom_soap[0].samples)

# Using the original SOAP descriptor, move all keys to properties.


# Define the number of structures to select using FPS
n_envs = 10

# FPS sample selection
selector_atomic_fps = sample_selection.FPS(n_to_select=n_envs, initialize="random").fit(
atom_soap_single_block
)
print(
"atomic envs selected with FPS: \n (struct_idx, atom_idx, species_center) \n",
selector_atomic_fps.support.block(0).samples.values,
)


# %%
# Perform structure (i.e. sample) selection with FPS/CUR
# ---------------------------------------------------------
#
# Instead of atomic environments, one can also select diverse structures. We can
# use the `sum_over_samples` operation in metatensor to define features in the
# structural basis instead of the atomic basis. This is done by summing over the
# atomic environments, labeled by the 'center' index in the samples of the
# TensorMap.
#
# Alternatively, one could use the `mean_over_samples` operation, depending on
# the specific inhomogeneity of the size of the structures in the training set.

print("----Structure selection-----")

# Define the number of structures to select *per block* using FPS
n_structures = 10

# FPS structure selection
selector_struct_fps = sample_selection.FPS(
n_to_select=n_structures, initialize="random"
).fit(struct_soap)
struct_fps_idxs = selector_struct_fps.support.block(0).samples.values.flatten()

print("structures selected with FPS:\n", struct_fps_idxs)

# CUR structure selection
selector_struct_cur = sample_selection.CUR(n_to_select=n_structures).fit(struct_soap)
struct_cur_idxs = selector_struct_cur.support.block(0).samples.values.flatten()
print("structures selected with CUR:\n", struct_cur_idxs)

print("Structure indices obtained with FPS ", struct_fps_idxs)
print("Structure indices obtained with CUR ", struct_cur_idxs)

# Slice structure descriptor along axis 0 to contain only the selected structures
struct_dscrptr_fps = struct_dscrptr[struct_fps_idxs, :]
struct_dscrptr_cur = struct_dscrptr[struct_cur_idxs, :]
assert struct_dscrptr_fps.shape == struct_dscrptr_cur.shape
struct_soap_fps = struct_soap.block(0).values[struct_fps_idxs, :]
struct_soap_cur = struct_soap.block(0).values[struct_cur_idxs, :]
assert struct_soap_fps.shape == struct_soap_cur.shape

print("Structure descriptor shape before selection ", struct_dscrptr.shape)
print("Structure descriptor shape after selection ", struct_dscrptr_fps.shape)
print("Structure descriptor shape before selection ", struct_soap.block(0).values.shape)
print("Structure descriptor shape after selection (FPS)", struct_soap_fps.shape)
print("Structure descriptor shape after selection (CUR)", struct_soap_cur.shape)


# %%
Expand All @@ -120,8 +219,8 @@


# Generate a structure PCA
struct_dscrptr_pca = PCA(n_components=2).fit_transform(struct_dscrptr)
assert struct_dscrptr_pca.shape == (n_frames, 2)
struct_soap_pca = PCA(n_components=2).fit_transform(struct_soap.block(0).values)
assert struct_soap_pca.shape == (n_frames, 2)


# %%
Expand All @@ -133,16 +232,10 @@

# Matplotlib plot
fig, ax = plt.subplots(1, 1, figsize=(6, 4))
scatter = ax.scatter(struct_dscrptr_pca[:, 0], struct_dscrptr_pca[:, 1], c="red")
ax.plot(
struct_dscrptr_pca[struct_cur_idxs, 0],
struct_dscrptr_pca[struct_cur_idxs, 1],
"kx",
label="CUR selection",
)
scatter = ax.scatter(struct_soap_pca[:, 0], struct_soap_pca[:, 1], c="red")
ax.plot(
struct_dscrptr_pca[struct_fps_idxs, 0],
struct_dscrptr_pca[struct_fps_idxs, 1],
struct_soap_pca[struct_cur_idxs, 0],
struct_soap_pca[struct_cur_idxs, 1],
"ko",
fillstyle="none",
label="FPS selection",
Expand Down Expand Up @@ -181,13 +274,12 @@

properties.update(
{
"PC1": struct_dscrptr_pca[:, 0],
"PC2": struct_dscrptr_pca[:, 1],
"PC1": struct_soap_pca[:, 0],
"PC2": struct_soap_pca[:, 1],
"selection": np.array(selection_levels),
}
)

print(properties)

# Display with chemiscope. This currently does not work - as raised in issue #8
# https://github.com/lab-cosmo/software-cookbook/issues/8
Expand Down Expand Up @@ -221,22 +313,63 @@
# Now perform feature selection. In this example we will go back to using the
# descriptor decomposed into atomic environments, as opposed to the one
# decomposed into structure environments, but only use FPS for brevity.
print("----Feature selection-----")

# Define the number of features to select
n_features = 200

# FPS feature selection
feat_fps = feature_selection.FPS(n_to_select=n_features, initialize="random").fit(
atom_dscrptr
atom_soap_single_block
)

# Slice atomic descriptor along axis 1 to contain only the selected features
# atom_soap_single_block_fps = atom_soap_single_block.block(0).values[:, feat_fps_idxs]
atom_soap_single_block_fps = metatensor.slice(
atom_soap_single_block,
axis="properties",
labels=feat_fps.support.block(0).properties,
)

print(
"atomic descriptor shape before selection ",
atom_soap_single_block.block(0).values.shape,
)
print(
"atomic descriptor shape after selection ",
atom_soap_single_block_fps.block(0).values.shape,
)

# %%

# %%
# Perform feature selection (skmatter)
# ------------------------------------
#
# Now perform feature selection. In this example we will go back to using the
# descriptor decomposed into atomic environments, as opposed to the one
# decomposed into structure environments, but only use FPS for brevity.

print("----Feature selection (skmatter)-----")

# Define the number of features to select
n_features = 200

# FPS feature selection
feat_fps = skfeat_selection.FPS(n_to_select=n_features, initialize="random").fit(
atom_soap_single_block.block(0).values
)
feat_fps_idxs = feat_fps.selected_idx_

print("Feature indices obtained with FPS ", feat_fps_idxs)

# Slice atomic descriptor along axis 1 to contain only the selected features
atom_dscrptr_fps = atom_dscrptr[:, feat_fps_idxs]
atom_dscrptr_fps = atom_soap_single_block.block(0).values[:, feat_fps_idxs]

print("atomic descriptor shape before selection ", atom_dscrptr.shape)
print(
"atomic descriptor shape before selection ",
atom_soap_single_block.block(0).values.shape,
)
print("atomic descriptor shape after selection ", atom_dscrptr_fps.shape)

# %%